cross_validate_with_fine_tuning.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Thu Oct 29 13:58:23 2020
  5. @author: tanya
  6. @description:
  7. scenario 1:
  8. You have a train set and a validation set and you tune the probability
  9. threshold on the validation set:
  10. X = X_train,
  11. y = y_train,
  12. X_val = X_val,
  13. y_val = y_val
  14. and you set to None:
  15. cv = None,
  16. X_val_threshold = None,
  17. y_val_threshold = None,
  18. cv_threshold = None
  19. Downsides:
  20. 1) You return a single validation score
  21. (cross validation score would be more robust)
  22. 2) You fine tune on the same validation set as you calculate
  23. the score on. Using an independent validation data set for fine tuning would be
  24. more robust
  25. 3) You fine tune the probability threshold on a single dataset.
  26. It would be more robust to tune on several independent datasets
  27. and take the average probability threshold.
  28. scenario 2:
  29. You have a train set and a validation set and you tune the probability
  30. threshold on an independent set. You need to pass the independent data set
  31. to the X_val_threshold and y_val_threshold parameters:
  32. X = X_train,
  33. y = y_train,
  34. X_val = X_val,
  35. y_val = y_val,
  36. X_val_threshold = X_val_independent,
  37. y_val_threshold = y_val_independent
  38. and you set to None:
  39. cv = None,
  40. cv_threshold = None
  41. Downsides:
  42. 1) You return a single validation score
  43. (cross validation score would be more robust)
  44. 2) You fine tune the probability threshold on a single dataset.
  45. It would be more robust to tune on several independent datasets
  46. and take the average probability threshold.
  47. scenario 3:
  48. You have a dataset on which you want to calculate the cross-validation
  49. score and a cv object. You fine tune the probability threshold on each fold,
  50. using the validation part of the fold.
  51. X = X_train,
  52. y = y_train,
  53. cv = cv
  54. and you set to None:
  55. X_val = None,
  56. y_val = None,
  57. X_val_threshold = None,
  58. y_val_threshold = None,
  59. cv_threshold = None
  60. Downsides:
  61. 2) In each fold, you fine tune on the same validation set as you calculate
  62. the score on. Using an independent validation data set for fine tuning would be
  63. more robust
  64. 3) In each fold, you fine tune the probability threshold on a single dataset.
  65. It would be more robust to tune on several independent datasets
  66. and take the average probability threshold.
  67. scenario 4:
  68. You have a dataset on which you want to calculate the cross-validation
  69. score and a cv object. You fine tune the probability threshold on independent
  70. dataset (or multiple datasets) in each fold.
  71. You need to have a cv_threshold object that tells you how to
  72. split each of the folds of your cv.
  73. Example 1:
  74. cv = [((1, 2, 3, 4), (5, 6, 7)),
  75. ((5, 6, 7, 8), (9, 10))]
  76. cv_threshold = [ [(1,2), (3, 4)],
  77. [(5, 6), (7, 8)]
  78. ]
  79. Example 2:
  80. cv = 3
  81. cv_threshold = [4, 4, 4]
  82. Example 3:
  83. cv = [((1, 2, 3, 4, 5, 6), (7, 8, 9)),
  84. ((5, 6, 7, 8), (9, 10))]
  85. cv_threshold = [ [((1, 2), (3, 4, 5)),
  86. ((2, 3), (4, 5, 6))
  87. ]
  88. ]
  89. #####################
  90. X = X_train,
  91. y = y_train,
  92. cv = cv,
  93. cv_threshold = cv_threshold
  94. and you set to None:
  95. X_val = None,
  96. y_val = None,
  97. X_val_threshold = None,
  98. y_val_threshold = None
  99. Downsides:
  100. 2) In each fold, you fine tune on the same validation set as you calculate
  101. the score on. Using an independent validation data set for fine tuning would be
  102. more robust
  103. 3) In each fold, you fine tune the probability threshold on a single dataset.
  104. It would be more robust to tune on several independent datasets
  105. and take the average probability threshold.
  106. """
  107. import sys
  108. import numpy as np
  109. from itertools import zip_longest
  110. from numpy.typing import ArrayLike
  111. if sys.version_info >= (3, 8):
  112. from typing import Callable, Dict, Iterable, Union
  113. else:
  114. from typing_extensions import Callable, Dict, Iterable, Union
  115. from copy import deepcopy
  116. from sklearn.model_selection import StratifiedKFold
  117. from cdplib.log import Log
  118. from cdplib.ml_validation.CVComposer import CVComposer
  119. from cdplib.fine_tuning import get_optimal_proba_threshold
  120. # TODO: write with yield !!!!
  121. def cross_validate_with_optimal_threshold(
  122. score_func_threshold: Callable,
  123. estimator: object,
  124. X: ArrayLike,
  125. y: ArrayLike = None,
  126. groups: ArrayLike = None,
  127. scoring: Union[Callable, Dict] = None,
  128. cv: Union[Iterable, int, None] = None,
  129. n_jobs: int = None,
  130. verbose: int = None,
  131. fit_params: Dict = None,
  132. pre_dispatch: int = None,
  133. return_train_score: bool = False,
  134. return_estimator: bool = False,
  135. error_score: float = np.nan,
  136. X_val: ArrayLike = None,
  137. y_val: ArrayLike = None,
  138. X_val_threshold: ArrayLike = None,
  139. y_val_threshold: ArrayLike = None,
  140. cv_threshold: Union[Iterable, int, None] = None,
  141. threshold_set: Union[Iterable, None] = None,
  142. scores: Dict = None)-> Dict:
  143. """
  144. """
  145. logger = Log("cross_validate_with_optimal_threshold:")
  146. X_train = deepcopy(X)
  147. y_train = deepcopy(y)
  148. X_val = deepcopy(X_val)
  149. y_val = deepcopy(y_val)
  150. X_val_threshold = deepcopy(X_val_threshold)
  151. y_val_threshold = deepcopy(y_val_threshold)
  152. scores = scores or {"test_threshold": [],
  153. "test_score_threshold": [],
  154. "train_score_threshold": []}
  155. scoring = scoring or {}
  156. for metric_name, metric in scoring.items():
  157. if "test_" + metric_name not in scores:
  158. scores["test_" + metric_name] = []
  159. scores["train_" + metric_name] = []
  160. if cv is None:
  161. # test score is calculated on X_vals
  162. assert((X_val is not None) and (y_val is not None)),\
  163. "Validation set must be set"
  164. if cv_threshold is None:
  165. refit = (X_val_threshold is not None)
  166. # if a validation set for proba threshold tuning is not given,
  167. # we use the validation set on which we calculate the test score
  168. # (this might lead to overfitting)
  169. X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
  170. y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
  171. cv_threshold, X_train, y_train =\
  172. CVComposer().dummy_cv_and_concatenated_data_set(
  173. X_train=X_train,
  174. X_test=X_val_threshold,
  175. y_train=y_train,
  176. y_test=y_val_threshold)
  177. else:
  178. # if cv_threshold is given, we find the optimal threshold
  179. # on each fold and output the average value for the threshold
  180. if (X_val_threshold is not None):
  181. logger.log_and_throw_warning((
  182. "X_val_threshold is set "
  183. "but cv_threshold will be used"))
  184. if isinstance(cv_threshold, int):
  185. cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
  186. .split(X=X_train, y=y_train)
  187. refit = True
  188. thresholds = []
  189. for train_inds, val_inds in cv_threshold:
  190. X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
  191. CVComposer().cv_slice_dataset(
  192. X=X_train,
  193. y=y_train,
  194. train_inds=train_inds,
  195. test_inds=val_inds)
  196. estimator.fit(X_train_fold, y_train_fold)
  197. proba_val = estimator.predict_proba(X_val_fold)[:, 1]
  198. threshold = get_optimal_proba_threshold(
  199. score_func=score_func_threshold,
  200. y_true=y_val_fold,
  201. proba=proba_val)
  202. thresholds.append(threshold)
  203. scores["test_threshold"].append(np.mean(thresholds))
  204. if refit:
  205. estimator.fit(X_train, y_train)
  206. proba_val = estimator.predict_proba(X_val)[:, 1]
  207. proba_train = estimator.predict_proba(X_train)[:, 1]
  208. pred_train = (proba_train >= threshold)
  209. pred_val = (proba_val >= threshold)
  210. train_score = score_func_threshold(y_train, pred_train)
  211. test_score = score_func_threshold(y_val, pred_val)
  212. for metric_name, metric in scoring.items():
  213. scores["train_" + metric_name].append(metric(y_train, pred_train))
  214. scores["test_" + metric_name].append(metric(y_val, pred_val))
  215. scores["train_score_threshold"].append(train_score)
  216. scores["test_score_threshold"].append(test_score)
  217. return scores
  218. else:
  219. if isinstance(cv, int):
  220. cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)
  221. cv_threshold = cv_threshold or []
  222. for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):
  223. X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
  224. CVComposer().cv_slice_dataset(
  225. X=X_train,
  226. y=y_train,
  227. train_inds=train_inds,
  228. test_inds=val_inds)
  229. scores = cross_validate_with_optimal_threshold(
  230. estimator=estimator,
  231. score_func_threshold=score_func_threshold,
  232. X=X_train_fold,
  233. y=y_train_fold,
  234. X_val=X_val_fold,
  235. y_val=y_val_fold,
  236. cv_threshold=cv_fold,
  237. scoring=scoring,
  238. threshold_set=threshold_set,
  239. scores=scores)
  240. return scores
  241. if __name__ == "__main__":
  242. from sklearn.metrics import accuracy_score, precision_score
  243. from sklearn.datasets import load_breast_cancer
  244. from xgboost import XGBRFClassifier
  245. from sklearn.model_selection import train_test_split
  246. data_loader = load_breast_cancer()
  247. X = data_loader["data"]
  248. y = data_loader["target"]
  249. X_train, X_val, y_train, y_val = train_test_split(X, y)
  250. estimator = XGBRFClassifier(use_label_encoder=False,
  251. eval_metric="logloss")
  252. score_func = accuracy_score
  253. scoring = {"precision": precision_score}
  254. averaged_scores = []
  255. averaged_thresholds = []
  256. print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
  257. scores = cross_validate_with_optimal_threshold(
  258. score_func_threshold=accuracy_score,
  259. estimator=estimator,
  260. X=X_train,
  261. y=y_train,
  262. scoring=scoring,
  263. cv=None,
  264. X_val=X_val,
  265. y_val=y_val,
  266. X_val_threshold=None,
  267. y_val_threshold=None,
  268. cv_threshold=None)
  269. print("\nScores:", scores)
  270. averaged_scores.append(np.mean(scores["test_score_threshold"]))
  271. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  272. print("\n ########################################################## \n")
  273. X_train, X_val_threshold, y_train, y_val_threshold =\
  274. train_test_split(X_train, y_train)
  275. print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
  276. scores = cross_validate_with_optimal_threshold(
  277. score_func_threshold=accuracy_score,
  278. estimator=estimator,
  279. X=X_train,
  280. y=y_train,
  281. scoring=scoring,
  282. cv=None,
  283. X_val=X_val,
  284. y_val=y_val,
  285. X_val_threshold=X_val_threshold,
  286. y_val_threshold=y_val_threshold,
  287. cv_threshold=None)
  288. print("\nScores:", scores)
  289. averaged_scores.append(np.mean(scores["test_score_threshold"]))
  290. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  291. print("\n ########################################################## \n")
  292. print("\nTesting cv=None, cv_threshold=3 \n")
  293. scores = cross_validate_with_optimal_threshold(
  294. score_func_threshold=accuracy_score,
  295. estimator=estimator,
  296. X=X_train,
  297. y=y_train,
  298. scoring=scoring,
  299. cv=None,
  300. X_val=X_val,
  301. y_val=y_val,
  302. X_val_threshold=X_val_threshold,
  303. y_val_threshold=y_val_threshold,
  304. cv_threshold=3)
  305. print("\nScores:", scores)
  306. averaged_scores.append(np.mean(scores["test_score_threshold"]))
  307. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  308. print("\n ########################################################## \n")
  309. print("\nTesting cv=3, cv_threshold=None \n")
  310. scores = cross_validate_with_optimal_threshold(
  311. score_func_threshold=accuracy_score,
  312. estimator=estimator,
  313. X=X_train,
  314. y=y_train,
  315. scoring=scoring,
  316. cv=3,
  317. X_val=None,
  318. y_val=None,
  319. X_val_threshold=None,
  320. y_val_threshold=None,
  321. cv_threshold=None)
  322. print("\nScores:", scores)
  323. print("\n ########################################################## \n")
  324. print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
  325. scores = cross_validate_with_optimal_threshold(
  326. score_func_threshold=accuracy_score,
  327. estimator=estimator,
  328. X=X_train,
  329. y=y_train,
  330. scoring=scoring,
  331. cv=3,
  332. X_val=X_val,
  333. y_val=y_val,
  334. X_val_threshold=X_val_threshold,
  335. y_val_threshold=y_val_threshold,
  336. cv_threshold=[3, 3, 3])
  337. print("\nScores:", scores)
  338. averaged_scores.append(np.mean(scores["test_score_threshold"]))
  339. averaged_thresholds.append(np.mean(scores["test_threshold"]))
  340. print("\n ########################################################## \n")
  341. # TODO: check overwriting X_train,
  342. # additional metrics append instead of overwrite
  343. # check the length of cv_threshold
  344. # test custom cv, cv_threshold
  345. print("\n Averaged test score:", averaged_scores)
  346. print("\n Averaged threshold:", averaged_thresholds)