cross_validate_with_fine_tuning.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Oct 29 13:58:23 2020
  5. @author: tanya
  6. @description:
  7. * Input:
  8. - pipeline/hyperparameter space
  9. - data_train
  10. - cv
  11. - cv_folds
  12. * For each pipeline:
  13. -> Split data_train into folds according to cv
  14. -> For each fold:
  15. => get data_train_fold, data_test_fold, cv_fold
  16. => split data_train_fold into subfolds according to cv_fold
  17. => For each subfold:
  18. ==> get data_train_subfold, data_test_subfold
  19. ==> train pipeline on data_train_subfold
  20. ==> find best_threshold_subfold on data_test_subfold
  21. => Find averaged_threshold_fold averaged over best_threshold_subfold
  22. => train pipeline on data_train_fold
  23. => find score_fold on data_test_fold with averaged_threshold_fold
  24. => find best_threshold_fold on data_test_fold
  25. -> find score averaged over score_fold
  26. -> find averaged_threshold averaged over best_threshold_fold
  27. * choose (pipeline/hyperparameters, threshold) in the space with best score
  28. """
  29. import pandas as pd
  30. import numpy as np
  31. from itertools import zip_longest
  32. from typing import Union, Callable, Dict, Iterable, Tuple, List
  33. from copy import deepcopy
  34. from itertools import accumulate, repeat, takewhile, chain
  35. from sklearn.model_selection import StratifiedKFold
  36. from cdplib.log import Log
  37. aa = make_sliding_window_cv(data_set_size=50,
  38. test_proportion=0.1,
  39. train_proportion=0.6,
  40. step_proportion=0.1)
  41. aa = list(aa)
  42. aa = make_sliding_window_cv(test_proportion=0.1,
  43. train_proportion=0.6,
  44. step_proportion=0.05,
  45. index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))
  46. aa = list(aa)
  47. # TODO: write with yield !!!!
  48. def make_nested_expanding_cv(
  49. test_proportion: float,
  50. start_train_proportion: float,
  51. step_proportion: float = None,
  52. expanding_test_size: bool = False,
  53. data_set_size: Union[float, None] = None,
  54. index: Union[pd.Series, np.ndarray, list, None] = None)\
  55. -> Iterable[Tuple[List]]:
  56. """
  57. """
  58. logger = Log("make_nested_expanding_cv:")
  59. try:
  60. cv = make_expanding_cv(test_proportion=test_proportion,
  61. start_train_proportion=start_train_proportion,
  62. step_proportion=step_proportion,
  63. expanding_test_size=expanding_test_size,
  64. data_set_size=data_set_size,
  65. index=index)
  66. nested_cv = []
  67. for train_inds, test_inds in cv:
  68. fold_index = train_inds if index is not None\
  69. else None
  70. fold_size = len(train_inds) if index is None else None
  71. fold_cv = make_expanding_cv(
  72. test_proportion=test_proportion,
  73. start_train_proportion=start_train_proportion,
  74. step_proportion=step_proportion,
  75. expanding_test_size=expanding_test_size,
  76. data_set_size=fold_size,
  77. index=fold_index)
  78. nested_cv.append(list(fold_cv))
  79. return nested_cv
  80. except Exception as e:
  81. logger.log_and_raise_error(("Failed to make nested expanding cv. "
  82. "Exit with error: {}".format(e)))
  83. for train_inds, test_inds in aa:
  84. print(len(test_inds)/(len(train_inds) + len(test_inds)))
  85. print(len(test_inds)/50)
  86. aaa = list(aaa)
  87. for aaa_cv in aaa:
  88. for train_inds, test_inds in aaa_cv:
  89. print(len(test_inds)/(len(train_inds) + len(test_inds)))
  90. print(len(test_inds)/50)
  91. aaa = make_nested_expanding_cv(#data_set_size=50,
  92. test_proportion=0.1,
  93. start_train_proportion=0.6,
  94. step_proportion=0.1,
  95. index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))
  96. aaa = list(aaa)
  97. def cv_slice_dataset(X, y, train_inds, test_inds)\
  98. -> Tuple[Union[pd.DataFrame, np.ndarray],
  99. Union[pd.Series, np.ndarray]]:
  100. """
  101. """
  102. if isinstance(X, pd.DataFrame):
  103. X_train = X.loc[train_inds]
  104. X_val = X.loc[test_inds]
  105. else:
  106. X_train = X[train_inds]
  107. X_val = X[test_inds]
  108. if y is not None:
  109. y_train = y[train_inds]
  110. y_val = y[test_inds]
  111. return X_train, X_val, y_train, y_val
  112. def get_optimal_proba_threshold(score_func: Callable,
  113. y_true: Union[pd.Series, np.ndarray],
  114. proba: Union[pd.Series, np.ndarray],
  115. threshold_set: Union[Iterable, None] = None):
  116. """
  117. """
  118. scores = {}
  119. if threshold_set is None:
  120. threshold_set = np.arange(0, 1, 0.1)
  121. for threshold in threshold_set:
  122. y_pred = (proba >= threshold).astype(int)
  123. scores[threshold] = score_func(y_true, y_pred)
  124. return max(scores, key=scores.get)
  125. def cross_validate_with_optimal_threshold(
  126. estimator: object,
  127. score_func: Callable,
  128. X_train: Union[pd.DataFrame, np.ndarray],
  129. y_train: Union[pd.Series, np.ndarray, None] = None,
  130. X_val: Union[pd.DataFrame, np.ndarray, None] = None,
  131. y_val: Union[pd.Series, np.ndarray, None] = None,
  132. X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
  133. y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
  134. cv: Union[Iterable, int, None] = None,
  135. cv_threshold: Union[Iterable, int, None] = None,
  136. additional_metrics: Union[Dict[str, Callable], None] = None,
  137. threshold_set: Union[Iterable, None] = None,
  138. scores: Dict = None)\
  139. -> Dict:
  140. """
  141. """
  142. logger = Log("cross_validate_with_optimal_threshold:")
  143. X_train = deepcopy(X_train)
  144. y_train = deepcopy(y_train)
  145. X_val = deepcopy(X_val)
  146. y_val = deepcopy(y_val)
  147. X_val_threshold = deepcopy(X_val_threshold)
  148. y_val_threshold = deepcopy(y_val_threshold)
  149. scores = scores or {"test_threshold": [],
  150. "test_score": [],
  151. "train_score": []}
  152. additional_metrics = additional_metrics or {}
  153. for metric_name, metric in additional_metrics.items():
  154. if "test_" + metric_name not in scores:
  155. scores["test_" + metric_name] = []
  156. scores["train_" + metric_name] = []
  157. if cv is None:
  158. # test score is calculated on X_vals
  159. assert((X_val is not None) and (y_val is not None)),\
  160. "Validation set must be set"
  161. if cv_threshold is None:
  162. refit = (X_val_threshold is not None)
  163. # if a validation set for proba threshold tuning is not given,
  164. # we use the validation set on which we calculate the test score
  165. # (this might lead to overfitting)
  166. X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
  167. y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
  168. cv_threshold, X_train, y_train = make_dummy_cv(
  169. X_train=X_train,
  170. y_train=y_train,
  171. X_val=X_val_threshold,
  172. y_val=y_val_threshold)
  173. else:
  174. # if cv_threshold is given, we find the optimal threshold
  175. # on each fold and output the average value for the threshold
  176. if (X_val_threshold is not None):
  177. logger.log_and_throw_warning((
  178. "X_val_threshold is set "
  179. "but cv_threshold will be used"))
  180. if isinstance(cv_threshold, int):
  181. cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
  182. .split(X=X_train, y=y_train)
  183. refit = True
  184. thresholds = []
  185. for train_inds, val_inds in cv_threshold:
  186. print("----- In cv threshold fold")
  187. X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
  188. cv_slice_dataset(X=X_train,
  189. y=y_train,
  190. train_inds=train_inds,
  191. test_inds=val_inds)
  192. estimator.fit(X_train_fold, y_train_fold)
  193. proba_val = estimator.predict_proba(X_val_fold)[:, 1]
  194. threshold = get_optimal_proba_threshold(score_func=score_func,
  195. y_true=y_val_fold,
  196. proba=proba_val)
  197. thresholds.append(threshold)
  198. print("----- Threshold:", threshold)
  199. scores["test_threshold"].append(np.mean(thresholds))
  200. if refit:
  201. estimator.fit(X_train, y_train)
  202. proba_val = estimator.predict_proba(X_val)[:, 1]
  203. proba_train = estimator.predict_proba(X_train)[:, 1]
  204. pred_train = (proba_train >= threshold)
  205. pred_val = (proba_val >= threshold)
  206. train_score = score_func(y_train, pred_train)
  207. test_score = score_func(y_val, pred_val)
  208. for metric_name, metric in additional_metrics.items():
  209. scores["train_" + metric_name].append(metric(y_train, pred_train))
  210. scores["test_" + metric_name].append(metric(y_val, pred_val))
  211. scores["train_score"].append(train_score)
  212. scores["test_score"].append(test_score)
  213. return scores
  214. else:
  215. if isinstance(cv, int):
  216. cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)
  217. cv_threshold = cv_threshold or []
  218. for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):
  219. print("=== In cv fold")
  220. X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
  221. cv_slice_dataset(X=X_train,
  222. y=y_train,
  223. train_inds=train_inds,
  224. test_inds=val_inds)
  225. scores = cross_validate_with_optimal_threshold(
  226. estimator=estimator,
  227. score_func=score_func,
  228. X_train=X_train_fold,
  229. y_train=y_train_fold,
  230. X_val=X_val_fold,
  231. y_val=y_val_fold,
  232. cv_threshold=cv_fold,
  233. additional_metrics=additional_metrics,
  234. threshold_set=threshold_set,
  235. scores=scores)
  236. print("=== scores:", scores)
  237. return scores
  238. if __name__ == "__main__":
  239. from sklearn.metrics import accuracy_score, precision_score
  240. from sklearn.datasets import load_breast_cancer
  241. from xgboost import XGBRFClassifier
  242. from sklearn.model_selection import train_test_split
  243. data_loader = load_breast_cancer()
  244. X = data_loader["data"]
  245. y = data_loader["target"]
  246. X_train, X_val, y_train, y_val = train_test_split(X, y)
  247. estimator = XGBRFClassifier()
  248. score_func = accuracy_score
  249. additional_metrics = {"precision": precision_score}
  250. averaged_scores = []
  251. averaged_thresholds = []
  252. print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
  253. scores = cross_validate_with_optimal_threshold(
  254. estimator=estimator,
  255. score_func=accuracy_score,
  256. X_train=X_train,
  257. y_train=y_train,
  258. X_val=X_val,
  259. y_val=y_val,
  260. X_val_threshold=None,
  261. y_val_threshold=None,
  262. cv=None,
  263. cv_threshold=None,
  264. additional_metrics=additional_metrics)
  265. print("\nScores:", scores)
  266. averaged_scores.append(np.mean(scores["test_score"]))
  267. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  268. print("\n ########################################################## \n")
  269. X_train, X_val_threshold, y_train, y_val_threshold =\
  270. train_test_split(X_train, y_train)
  271. print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
  272. scores = cross_validate_with_optimal_threshold(
  273. estimator=estimator,
  274. score_func=accuracy_score,
  275. X_train=X_train,
  276. y_train=y_train,
  277. X_val=X_val,
  278. y_val=y_val,
  279. X_val_threshold=X_val_threshold,
  280. y_val_threshold=y_val_threshold,
  281. cv=None,
  282. cv_threshold=None,
  283. additional_metrics=additional_metrics)
  284. print("\nScores:", scores)
  285. averaged_scores.append(np.mean(scores["test_score"]))
  286. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  287. print("\n ########################################################## \n")
  288. print("\nTesting cv=None, cv_threshold=3 \n")
  289. scores = cross_validate_with_optimal_threshold(
  290. estimator=estimator,
  291. score_func=accuracy_score,
  292. X_train=X_train,
  293. y_train=y_train,
  294. X_val=X_val,
  295. y_val=y_val,
  296. X_val_threshold=X_val_threshold,
  297. y_val_threshold=y_val_threshold,
  298. cv=None,
  299. cv_threshold=3,
  300. additional_metrics=additional_metrics)
  301. print("\nScores:", scores)
  302. averaged_scores.append(np.mean(scores["test_score"]))
  303. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  304. print("\n ########################################################## \n")
  305. print("\nTesting cv=3, cv_threshold=None \n")
  306. scores = cross_validate_with_optimal_threshold(
  307. estimator=estimator,
  308. score_func=accuracy_score,
  309. X_train=X_train,
  310. y_train=y_train,
  311. X_val=X_val,
  312. y_val=y_val,
  313. X_val_threshold=X_val_threshold,
  314. y_val_threshold=y_val_threshold,
  315. cv=3,
  316. cv_threshold=None,
  317. additional_metrics=additional_metrics)
  318. print("\nScores:", scores)
  319. print("\n ########################################################## \n")
  320. print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
  321. scores = cross_validate_with_optimal_threshold(
  322. estimator=estimator,
  323. score_func=accuracy_score,
  324. X_train=X_train,
  325. y_train=y_train,
  326. X_val=X_val,
  327. y_val=y_val,
  328. X_val_threshold=X_val_threshold,
  329. y_val_threshold=y_val_threshold,
  330. cv=3,
  331. cv_threshold=[3, 3, 3],
  332. additional_metrics=additional_metrics)
  333. print("\nScores:", scores)
  334. averaged_scores.append(np.mean(scores["test_score"]))
  335. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  336. print("\n ########################################################## \n")
  337. # TODO: check overwriting X_train,
  338. # additional metrics append instead of overwrite
  339. # check the length of cv_threshold
  340. # test custom cv, cv_threshold
  341. print("\n Averaged test score:", averaged_scores)
  342. print("\n Averaged threshold:", averaged_thresholds)