cross_validate_with_fine_tuning.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 29 13:58:23 2020

@author: tanya

@description:

scenario 1:

You have a train set and a validation set and you tune the probability
threshold on the validation set:

    X = X_train,
    y = y_train,
    X_val = X_val,
    y_val = y_val

and you set to None:

    cv = None,
    X_val_threshold = None,
    y_val_threshold = None,
    cv_threshold = None

Downsides:

1) You return a single validation score
   (a cross-validation score would be more robust).

2) You fine-tune on the same validation set as you calculate
   the score on. Using an independent validation set for fine-tuning
   would be more robust.

3) You fine-tune the probability threshold on a single dataset.
   It would be more robust to tune on several independent datasets
   and take the average probability threshold.

scenario 2:

You have a train set and a validation set and you tune the probability
threshold on an independent set. You need to pass the independent set
to the X_val_threshold and y_val_threshold parameters:

    X = X_train,
    y = y_train,
    X_val = X_val,
    y_val = y_val,
    X_val_threshold = X_val_independent,
    y_val_threshold = y_val_independent

and you set to None:

    cv = None,
    cv_threshold = None

Downsides:

1) You return a single validation score
   (a cross-validation score would be more robust).

2) You fine-tune the probability threshold on a single dataset.
   It would be more robust to tune on several independent datasets
   and take the average probability threshold.

scenario 3:

You have a dataset on which you want to calculate the cross-validation
score and a cv object. You fine-tune the probability threshold on each
fold, using the validation part of the fold:

    X = X_train,
    y = y_train,
    cv = cv

and you set to None:

    X_val = None,
    y_val = None,
    X_val_threshold = None,
    y_val_threshold = None,
    cv_threshold = None

Downsides:

1) In each fold, you fine-tune on the same validation set as you
   calculate the score on. Using an independent validation set for
   fine-tuning would be more robust.

2) In each fold, you fine-tune the probability threshold on a single
   dataset. It would be more robust to tune on several independent
   datasets and take the average probability threshold.

scenario 4:

You have a dataset on which you want to calculate the cross-validation
score and a cv object. You fine-tune the probability threshold on an
independent dataset (or multiple datasets) in each fold.
You need to pass a cv_threshold object that tells how to split each of
the folds of your cv.

Example 1:

    cv = [((1, 2, 3, 4), (5, 6, 7)),
          ((5, 6, 7, 8), (9, 10))]

    cv_threshold = [[(1, 2), (3, 4)],
                    [(5, 6), (7, 8)]]

Example 2:

    cv = 3
    cv_threshold = [4, 4, 4]

Example 3:

    cv = [((1, 2, 3, 4, 5, 6), (7, 8, 9))]

    cv_threshold = [[((1, 2), (3, 4, 5)),
                     ((2, 3), (4, 5, 6))]]

#####################

    X = X_train,
    y = y_train,
    cv = cv,
    cv_threshold = cv_threshold

and you set to None:

    X_val = None,
    y_val = None,
    X_val_threshold = None,
    y_val_threshold = None

Downsides:

1) In each fold, you fine-tune on the same validation set as you
   calculate the score on. Using an independent validation set for
   fine-tuning would be more robust.

2) In each fold, you fine-tune the probability threshold on a single
   dataset. It would be more robust to tune on several independent
   datasets and take the average probability threshold.
"""
import sys

import numpy as np

from itertools import zip_longest

# from numpy.typing import ArrayLike

if (sys.version_info.major == 3) and (sys.version_info.minor >= 8):
    from typing import Callable, Dict, Iterable, Union
else:
    from typing_extensions import Callable, Dict, Iterable, Union

from copy import deepcopy

from sklearn.model_selection import StratifiedKFold

from cdplib.log import Log
from cdplib.ml_validation.CVComposer import CVComposer
from cdplib.ml_validation.fine_tuning import get_optimal_proba_threshold
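# NOTE: `get_optimal_proba_threshold` is assumed to grid-search candidate
# thresholds and return the one maximizing `score_func`; a minimal sketch of
# such a search (hypothetical; the actual implementation lives in
# cdplib.ml_validation.fine_tuning):
#
#     def get_optimal_proba_threshold(score_func, y_true, proba):
#         candidates = np.linspace(0.0, 1.0, 101)
#         return max(candidates,
#                    key=lambda t: score_func(y_true, proba >= t))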
def cross_validate_with_optimal_threshold(
        score_func_threshold: Callable,
        estimator: object,
        # X: ArrayLike,
        # y: ArrayLike = None,
        # groups: ArrayLike = None,
        X,
        y=None,
        groups=None,
        scoring: Union[Callable, Dict] = None,
        cv: Union[Iterable, int, None] = None,
        n_jobs: int = None,
        verbose: int = None,
        fit_params: Dict = None,
        pre_dispatch: int = None,
        return_train_score: bool = False,
        return_estimator: bool = False,
        error_score: float = np.nan,
        # X_val: ArrayLike = None,
        # y_val: ArrayLike = None,
        # X_val_threshold: ArrayLike = None,
        # y_val_threshold: ArrayLike = None,
        X_val=None,
        y_val=None,
        X_val_threshold=None,
        y_val_threshold=None,
        cv_threshold: Union[Iterable, int, None] = None,
        threshold_set: Union[Iterable, None] = None,
        scores: Dict = None) -> Dict:
    """
    Cross-validate the estimator and fine-tune the probability threshold
    used to turn predicted probabilities into class predictions.
    See the module docstring for the four supported scenarios.
    """
    logger = Log("cross_validate_with_optimal_threshold:")

    X_train = deepcopy(X)
    y_train = deepcopy(y)
    X_val = deepcopy(X_val)
    y_val = deepcopy(y_val)
    X_val_threshold = deepcopy(X_val_threshold)
    y_val_threshold = deepcopy(y_val_threshold)

    scores = scores or {"test_threshold": [],
                        "test_score_threshold": [],
                        "train_score_threshold": []}

    scoring = scoring or {}

    for metric_name, metric in scoring.items():
        if "test_" + metric_name not in scores:
            scores["test_" + metric_name] = []
            scores["train_" + metric_name] = []
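    # NOTE: `scores` maps result names to lists with one entry per fold;
    # the cv branch below passes this same dict into the recursive call,
    # so fold results are appended rather than overwritten.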
    if cv is None:

        # the test score is calculated on X_val
        assert (X_val is not None) and (y_val is not None), \
            "Validation set must be set"

        if cv_threshold is None:

            refit = (X_val_threshold is not None)

            # if a validation set for proba threshold tuning is not given,
            # we use the validation set on which we calculate the test score
            # (this might lead to overfitting)
            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)

            cv_threshold, X_train, y_train = \
                CVComposer().dummy_cv_and_concatenated_data_set(
                    X_train=X_train,
                    X_test=X_val_threshold,
                    y_train=y_train,
                    y_test=y_val_threshold)
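            # NOTE: `dummy_cv_and_concatenated_data_set` is assumed to
            # concatenate the train and threshold-validation sets and to
            # return a single-split cv over that concatenation, so the
            # tuning loop below sees exactly one fold.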
        else:

            # if cv_threshold is given, we find the optimal threshold
            # on each fold and output the average value for the threshold
            if X_val_threshold is not None:
                logger.log_and_throw_warning(
                    "X_val_threshold is set "
                    "but cv_threshold will be used")

            if isinstance(cv_threshold, int):
                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
                    .split(X=X_train, y=y_train)

            refit = True
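        # NOTE: when `refit` is False, the score validation set doubles as
        # the threshold-tuning set, and the estimator fitted inside the
        # single dummy fold below is reused for the final predictions.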
        thresholds = []

        for train_inds, val_inds in cv_threshold:

            X_train_fold, X_val_fold, y_train_fold, y_val_fold = \
                CVComposer().cv_slice_dataset(
                    X=X_train,
                    y=y_train,
                    train_inds=train_inds,
                    test_inds=val_inds)

            estimator.fit(X_train_fold, y_train_fold)

            proba_val = estimator.predict_proba(X_val_fold)[:, 1]

            threshold = get_optimal_proba_threshold(
                score_func=score_func_threshold,
                y_true=y_val_fold,
                proba=proba_val)

            thresholds.append(threshold)
        # use the average of the per-fold thresholds for the final
        # predictions, consistent with the value reported below
        threshold = np.mean(thresholds)

        scores["test_threshold"].append(threshold)

        if refit:
            estimator.fit(X_train, y_train)
            proba_val = estimator.predict_proba(X_val)[:, 1]

        proba_train = estimator.predict_proba(X_train)[:, 1]

        pred_train = (proba_train >= threshold)
        pred_val = (proba_val >= threshold)

        train_score = score_func_threshold(y_train, pred_train)
        test_score = score_func_threshold(y_val, pred_val)

        for metric_name, metric in scoring.items():
            scores["train_" + metric_name].append(metric(y_train, pred_train))
            scores["test_" + metric_name].append(metric(y_val, pred_val))

        scores["train_score_threshold"].append(train_score)
        scores["test_score_threshold"].append(test_score)

        return scores
    else:

        if isinstance(cv, int):
            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)

        cv_threshold = cv_threshold or []
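        # NOTE: zip_longest pads the shorter sequence with None, so folds
        # without a matching cv_threshold entry fall back to scenario 3
        # (cv_fold is None and the recursive call tunes the threshold on
        # the fold's own validation part).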
        for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):

            X_train_fold, X_val_fold, y_train_fold, y_val_fold = \
                CVComposer().cv_slice_dataset(
                    X=X_train,
                    y=y_train,
                    train_inds=train_inds,
                    test_inds=val_inds)

            scores = cross_validate_with_optimal_threshold(
                estimator=estimator,
                score_func_threshold=score_func_threshold,
                X=X_train_fold,
                y=y_train_fold,
                X_val=X_val_fold,
                y_val=y_val_fold,
                cv_threshold=cv_fold,
                scoring=scoring,
                threshold_set=threshold_set,
                scores=scores)

        return scores
if __name__ == "__main__":

    from sklearn.metrics import accuracy_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from xgboost import XGBRFClassifier
    from sklearn.model_selection import train_test_split

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    X_train, X_val, y_train, y_val = train_test_split(X, y)

    estimator = XGBRFClassifier(use_label_encoder=False,
                                eval_metric="logloss")

    score_func = accuracy_score
    scoring = {"precision": precision_score}

    averaged_scores = []
    averaged_thresholds = []
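    # Each block below exercises one parameter combination from the module
    # docstring; the averaged test scores and thresholds are collected and
    # printed at the end for comparison.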
    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")

    scores = cross_validate_with_optimal_threshold(
        score_func_threshold=accuracy_score,
        estimator=estimator,
        X=X_train,
        y=y_train,
        scoring=scoring,
        cv=None,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=None,
        y_val_threshold=None,
        cv_threshold=None)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))
    print("\n ########################################################## \n")

    X_train, X_val_threshold, y_train, y_val_threshold = \
        train_test_split(X_train, y_train)

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")

    scores = cross_validate_with_optimal_threshold(
        score_func_threshold=accuracy_score,
        estimator=estimator,
        X=X_train,
        y=y_train,
        scoring=scoring,
        cv=None,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv_threshold=None)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))
    print("\n ########################################################## \n")

    print("\nTesting cv=None, cv_threshold=3 \n")

    scores = cross_validate_with_optimal_threshold(
        score_func_threshold=accuracy_score,
        estimator=estimator,
        X=X_train,
        y=y_train,
        scoring=scoring,
        cv=None,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv_threshold=3)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))
    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=None \n")

    scores = cross_validate_with_optimal_threshold(
        score_func_threshold=accuracy_score,
        estimator=estimator,
        X=X_train,
        y=y_train,
        scoring=scoring,
        cv=3,
        X_val=None,
        y_val=None,
        X_val_threshold=None,
        y_val_threshold=None,
        cv_threshold=None)

    print("\nScores:", scores)

    print("\n ########################################################## \n")
    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")

    scores = cross_validate_with_optimal_threshold(
        score_func_threshold=accuracy_score,
        estimator=estimator,
        X=X_train,
        y=y_train,
        scoring=scoring,
        cv=3,
        X_val=X_val,
        y_val=y_val,
        X_val_threshold=X_val_threshold,
        y_val_threshold=y_val_threshold,
        cv_threshold=[3, 3, 3])

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # TODO: check overwriting X_train,
    # additional metrics append instead of overwrite
    # check the length of cv_threshold
    # test custom cv, cv_threshold

    print("\n Averaged test score:", averaged_scores)
    print("\n Averaged threshold:", averaged_thresholds)