
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 13:58:23 2020

@author: tanya

@description:

* Input:
    - pipeline/hyperparameter space
    - data_train
    - cv
    - cv_folds

* For each pipeline:
    -> Split data_train into folds according to cv
    -> For each fold:
        => get data_train_fold, data_test_fold, cv_fold
        => split data_train_fold into subfolds according to cv_fold
        => For each subfold:
            ==> get data_train_subfold, data_test_subfold
            ==> train pipeline on data_train_subfold
            ==> find best_threshold_subfold on data_test_subfold
        => find averaged_threshold_fold averaged over best_threshold_subfold
        => train pipeline on data_train_fold
        => find score_fold on data_test_fold with averaged_threshold_fold
        => find best_threshold_fold on data_test_fold
    -> find score averaged over score_fold
    -> find averaged_threshold averaged over best_threshold_fold

* Choose the (pipeline/hyperparameters, threshold) pair in the space with the
  best score (a minimal call sketch follows right below this docstring).
"""

import pandas as pd
import numpy as np

from itertools import zip_longest
from copy import deepcopy

# Callable, Dict, Iterable and Union are available from the standard
# typing module on Python 3.5+, so no typing_extensions fallback is needed.
from typing import Callable, Dict, Iterable, Union

from sklearn.model_selection import StratifiedKFold

from cdplib.log import Log
from cdplib.ml_validation.CVComposer import CVComposer


# TODO: rewrite as a generator (yield)
def get_optimal_proba_threshold(score_func: Callable,
                                y_true: Union[pd.Series, np.ndarray],
                                proba: Union[pd.Series, np.ndarray],
                                threshold_set: Union[Iterable, None] = None):
    """
    Find the probability threshold in threshold_set that maximizes
    score_func on (y_true, y_pred), where y_pred is obtained by
    binarizing proba at the given threshold.
    """
    scores = {}

    if threshold_set is None:
        threshold_set = np.arange(0, 1, 0.1)

    for threshold in threshold_set:
        y_pred = (proba >= threshold).astype(int)
        scores[threshold] = score_func(y_true, y_pred)

    return max(scores, key=scores.get)
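
# A minimal illustration of the search above (kept as a comment, not executed
# on import): with y_true = [0, 0, 1, 1], proba = [0.2, 0.4, 0.6, 0.8] and
# score_func = sklearn.metrics.accuracy_score, the grid point 0.5 of the
# default grid np.arange(0, 1, 0.1) separates the two classes perfectly and
# is returned (the first threshold attaining the maximal score wins ties).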


def cross_validate_with_optimal_threshold(
        estimator: object,
        score_func: Callable,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[pd.Series, np.ndarray, None] = None,
        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val: Union[pd.Series, np.ndarray, None] = None,
        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
        cv: Union[Iterable, int, None] = None,
        cv_threshold: Union[Iterable, int, None] = None,
        additional_metrics: Union[Dict[str, Callable], None] = None,
        threshold_set: Union[Iterable, None] = None,
        scores: Union[Dict, None] = None) -> Dict:
    """
    Cross-validate the estimator and, on every fold, tune the probability
    threshold that maximizes score_func. Returns a dictionary with per-fold
    train/test scores, the tuned thresholds and any additional metrics.
    """
    logger = Log("cross_validate_with_optimal_threshold:")

    X_train = deepcopy(X_train)
    y_train = deepcopy(y_train)
    X_val = deepcopy(X_val)
    y_val = deepcopy(y_val)
    X_val_threshold = deepcopy(X_val_threshold)
    y_val_threshold = deepcopy(y_val_threshold)

    scores = scores or {"test_threshold": [],
                        "test_score": [],
                        "train_score": []}

    additional_metrics = additional_metrics or {}

    for metric_name, metric in additional_metrics.items():
        if "test_" + metric_name not in scores:
            scores["test_" + metric_name] = []
            scores["train_" + metric_name] = []
    if cv is None:

        # the test score is calculated on X_val
        assert (X_val is not None) and (y_val is not None),\
            "Validation set must be set"

        if cv_threshold is None:

            refit = (X_val_threshold is not None)

            # if a validation set for proba threshold tuning is not given,
            # we use the validation set on which we calculate the test score
            # (this might lead to overfitting)
            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)

            cv_threshold, X_train, y_train =\
                CVComposer().dummy_cv_and_concatenated_data_set(
                    X_train=X_train,
                    X_test=X_val_threshold,
                    y_train=y_train,
                    y_test=y_val_threshold)

        else:
            # if cv_threshold is given, we find the optimal threshold
            # on each fold and report the value averaged over the folds
            if X_val_threshold is not None:
                logger.log_and_throw_warning((
                    "X_val_threshold is set "
                    "but cv_threshold will be used"))

            if isinstance(cv_threshold, int):
                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
                    .split(X=X_train, y=y_train)

            refit = True

        thresholds = []

        for train_inds, val_inds in cv_threshold:

            print("----- In cv threshold fold")

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                CVComposer().cv_slice_dataset(
                    X=X_train,
                    y=y_train,
                    train_inds=train_inds,
                    test_inds=val_inds)

            estimator.fit(X_train_fold, y_train_fold)

            proba_val = estimator.predict_proba(X_val_fold)[:, 1]

            threshold = get_optimal_proba_threshold(
                score_func=score_func,
                y_true=y_val_fold,
                proba=proba_val,
                threshold_set=threshold_set)

            thresholds.append(threshold)

            print("----- Threshold:", threshold)

        # the reported threshold is the average over the tuning folds
        mean_threshold = np.mean(thresholds)
        scores["test_threshold"].append(mean_threshold)

        if refit:
            estimator.fit(X_train, y_train)

        proba_val = estimator.predict_proba(X_val)[:, 1]
        proba_train = estimator.predict_proba(X_train)[:, 1]

        # score the train and validation sets with the averaged threshold
        pred_train = (proba_train >= mean_threshold).astype(int)
        pred_val = (proba_val >= mean_threshold).astype(int)

        train_score = score_func(y_train, pred_train)
        test_score = score_func(y_val, pred_val)

        for metric_name, metric in additional_metrics.items():
            scores["train_" + metric_name].append(metric(y_train, pred_train))
            scores["test_" + metric_name].append(metric(y_val, pred_val))

        scores["train_score"].append(train_score)
        scores["test_score"].append(test_score)

        return scores
    else:
        if isinstance(cv, int):
            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)

        cv_threshold = cv_threshold or []

        for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):

            print("=== In cv fold")

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                CVComposer().cv_slice_dataset(
                    X=X_train,
                    y=y_train,
                    train_inds=train_inds,
                    test_inds=val_inds)

            scores = cross_validate_with_optimal_threshold(
                estimator=estimator,
                score_func=score_func,
                X_train=X_train_fold,
                y_train=y_train_fold,
                X_val=X_val_fold,
                y_val=y_val_fold,
                cv_threshold=cv_fold,
                additional_metrics=additional_metrics,
                threshold_set=threshold_set,
                scores=scores)

            print("=== scores:", scores)

        return scores


if __name__ == "__main__":

    from sklearn.metrics import accuracy_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from xgboost import XGBRFClassifier
    from sklearn.model_selection import train_test_split

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    X_train, X_val, y_train, y_val = train_test_split(X, y)

    estimator = XGBRFClassifier()
    score_func = accuracy_score
    additional_metrics = {"precision": precision_score}

    averaged_scores = []
    averaged_thresholds = []

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")

    scores = cross_validate_with_optimal_threshold(
        estimator=estimator,
        score_func=accuracy_score,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=None,
        y_val_threshold=None,
        cv=None,
        cv_threshold=None,
        additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    X_train, X_val_threshold, y_train, y_val_threshold =\
        train_test_split(X_train, y_train)

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")

    scores = cross_validate_with_optimal_threshold(
        estimator=estimator,
        score_func=accuracy_score,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv=None,
        cv_threshold=None,
        additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    print("\nTesting cv=None, cv_threshold=3 \n")

    scores = cross_validate_with_optimal_threshold(
        estimator=estimator,
        score_func=accuracy_score,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv=None,
        cv_threshold=3,
        additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=None \n")

    scores = cross_validate_with_optimal_threshold(
        estimator=estimator,
        score_func=accuracy_score,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv=3,
        cv_threshold=None,
        additional_metrics=additional_metrics)

    print("\nScores:", scores)

    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")

    scores = cross_validate_with_optimal_threshold(
        estimator=estimator,
        score_func=accuracy_score,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv=3,
        cv_threshold=[3, 3, 3],
        additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")
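
    # An additional sketch (not part of the original demo): the threshold grid
    # searched on each tuning fold can be customised via ``threshold_set``
    # instead of the default np.arange(0, 1, 0.1).
    print("\nTesting cv=None, cv_threshold=3, custom threshold_set \n")

    scores = cross_validate_with_optimal_threshold(
        estimator=estimator,
        score_func=accuracy_score,
        X_train=X_train,
        y_train=y_train,
        X_val=X_val,
        y_val=y_val,
        cv=None,
        cv_threshold=3,
        threshold_set=np.arange(0.05, 1, 0.05),
        additional_metrics=additional_metrics)

    print("\nScores:", scores)

    print("\n ########################################################## \n")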
    # TODO: check overwriting of X_train,
    #       make additional metrics append instead of overwrite,
    #       check the length of cv_threshold,
    #       test custom cv and cv_threshold iterables

    print("\n Averaged test scores:", averaged_scores)
    print("\n Averaged thresholds:", averaged_thresholds)