@@ -8,52 +8,163 @@ Created on Thu Oct 29 13:58:23 2020

@description:

-* Input:
-    - pipeline/hyperparameter space
-    - data_train
-    - cv
-    - cv_folds
-
-* For each pipeline:
-
-    -> Split data_train into folds according to cv
-
-    -> For each fold:
-
-        => get data_train_fold, data_test_fold, cv_fold
-
-        => split data_train_fold into subfolds according to cv_fold
-
-        => For each subfold:
-
-            ==> get data_train_subfold, data_test_subfold
-
-            ==> train pipeline on data_train_subfold
-
-            ==> find best_threshold_subfold on data_test_subfold
-
-        => Find averaged_threshold_fold averaged over best_threshold_subfold
-
-        => train pipeline on data_train_fold
-
-        => find score_fold on data_test_fold with proba_threshold_fold
-
-        => find best_threshold_fold on data_test_fold
-
-    -> find score averaged over score_fold
-
-    -> find averaged_threshold averaged over best_threshold_fold
-
-* choose (pipeline/hyperparameters, threshold) in the space with best score
+scenario 1:
+
+    You have a train set and a validation set and you tune the probability
+    threshold on the validation set:
+
+        X = X_train,
+        y = y_train,
+        X_val = X_val,
+        y_val = y_val
+
+    and you set to None:
+
+        cv = None,
+        X_val_threshold = None,
+        y_val_threshold = None,
+        cv_threshold = None
+
+    Downsides:
+
+    1) You return a single validation score
+       (a cross-validation score would be more robust).
+
+    2) You fine-tune on the same validation set as you calculate the score
+       on. Using an independent validation data set for fine-tuning would be
+       more robust.
+
+    3) You fine-tune the probability threshold on a single dataset.
+       It would be more robust to tune on several independent datasets
+       and take the average probability threshold.
+
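+    A minimal call sketch for this scenario; the classifier and the
+    threshold metric below are placeholders (any sklearn-style classifier
+    and any score_func(y_true, y_pred) can be used):
+
+        from sklearn.linear_model import LogisticRegression
+        from sklearn.metrics import f1_score
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=LogisticRegression(),
+            X=X_train, y=y_train,
+            X_val=X_val, y_val=y_val)
+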
+scenario 2:
+
+    You have a train set and a validation set and you tune the probability
+    threshold on an independent set. You need to pass the independent data
+    set to the X_val_threshold and y_val_threshold parameters:
+
+        X = X_train,
+        y = y_train,
+        X_val = X_val,
+        y_val = y_val,
+        X_val_threshold = X_val_independent,
+        y_val_threshold = y_val_independent
+
+    and you set to None:
+
+        cv = None,
+        cv_threshold = None
+
+    Downsides:
+
+    1) You return a single validation score
+       (a cross-validation score would be more robust).
+
+    2) You fine-tune the probability threshold on a single dataset.
+       It would be more robust to tune on several independent datasets
+       and take the average probability threshold.
+
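+    A minimal call sketch for this scenario, with the same placeholder
+    classifier and metric as in scenario 1:
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=LogisticRegression(),
+            X=X_train, y=y_train,
+            X_val=X_val, y_val=y_val,
+            X_val_threshold=X_val_independent,
+            y_val_threshold=y_val_independent)
+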
+scenario 3:
+
+    You have a dataset on which you want to calculate the cross-validation
+    score and a cv object. You fine-tune the probability threshold on each
+    fold, using the validation part of the fold.
+
+        X = X_train,
+        y = y_train,
+        cv = cv
+
+    and you set to None:
+
+        X_val = None,
+        y_val = None,
+        X_val_threshold = None,
+        y_val_threshold = None,
+        cv_threshold = None
+
+    Downsides:
+
+    1) In each fold, you fine-tune on the same validation set as you
+       calculate the score on. Using an independent validation data set for
+       fine-tuning would be more robust.
+
+    2) In each fold, you fine-tune the probability threshold on a single
+       dataset. It would be more robust to tune on several independent
+       datasets and take the average probability threshold.
+
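+    A minimal call sketch for this scenario (placeholder classifier and
+    metric; cv may be an integer or an iterable of index pairs):
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=LogisticRegression(),
+            X=X_train, y=y_train,
+            cv=5)
+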
+scenario 4:
+
+    You have a dataset on which you want to calculate the cross-validation
+    score and a cv object. You fine-tune the probability threshold on an
+    independent dataset (or multiple datasets) in each fold.
+
+    You need to have a cv_threshold object that tells you how to
+    split each of the folds of your cv.
+
+    Example 1:
+
+    cv = [((1, 2, 3, 4), (5, 6, 7)),
+          ((5, 6, 7, 8), (9, 10))]
+
+    cv_threshold = [[(1, 2), (3, 4)],
+                    [(5, 6), (7, 8)]]
+
+    Example 2:
+
+    cv = 3
+    cv_threshold = [4, 4, 4]
+
+    Example 3:
+
+    cv = [((1, 2, 3, 4, 5, 6), (7, 8, 9)),
+          ((5, 6, 7, 8), (9, 10))]
+
+    cv_threshold = [[((1, 2), (3, 4, 5)),
+                     ((2, 3), (4, 5, 6))]]
+
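+    One way to build index structures shaped like Example 3 is to nest two
+    splitters. This is only a sketch using sklearn's KFold; any splitter
+    that yields (train, test) index pairs would do:
+
+        from sklearn.model_selection import KFold
+
+        cv, cv_threshold = [], []
+        for train_idx, test_idx in KFold(n_splits=3).split(X_train):
+            cv.append((tuple(train_idx), tuple(test_idx)))
+            # split the training part of the outer fold once more
+            cv_threshold.append(
+                [(tuple(train_idx[i]), tuple(train_idx[j]))
+                 for i, j in KFold(n_splits=4).split(train_idx)])
+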
+    For this scenario you pass:
+
+        X = X_train,
+        y = y_train,
+        cv = cv,
+        cv_threshold = cv_threshold
+
+    and you set to None:
+
+        X_val = None,
+        y_val = None,
+        X_val_threshold = None,
+        y_val_threshold = None
+
+    Downsides:
+
+    1) In each fold, you fine-tune on the same validation set as you
+       calculate the score on. Using an independent validation data set for
+       fine-tuning would be more robust.
+
+    2) In each fold, you fine-tune the probability threshold on a single
+       dataset. It would be more robust to tune on several independent
+       datasets and take the average probability threshold.
+
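+    A minimal call sketch for this scenario, reusing the integer form from
+    Example 2 (placeholder classifier and metric as before):
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=LogisticRegression(),
+            X=X_train, y=y_train,
+            cv=3,
+            cv_threshold=[4, 4, 4])
+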
"""

import sys

-import pandas as pd
import numpy as np
from itertools import zip_longest

+from numpy.typing import ArrayLike
+
if sys.version_info >= (3, 8):
    from typing import Callable, Dict, Iterable, Union
else:
@@ -67,44 +178,36 @@ from cdplib.log import Log

from cdplib.ml_validation.CVComposer import CVComposer

+from cdplib.fine_tuning import get_optimal_proba_threshold

-# TODO: write with yield !!!!
-
-def get_optimal_proba_threshold(score_func: Callable,
-                                y_true: Union[pd.Series, np.ndarray],
-                                proba: Union[pd.Series, np.ndarray],
-                                threshold_set: Union[Iterable, None] = None):
-    """
-    """
-    scores = {}
-
-    if threshold_set is None:
-        threshold_set = np.arange(0, 1, 0.1)
-
-    for threshold in threshold_set:
-
-        y_pred = (proba >= threshold).astype(int)
-
-        scores[threshold] = score_func(y_true, y_pred)
-
-    return max(scores, key=scores.get)
+# TODO: write with yield !!!!


def cross_validate_with_optimal_threshold(
        score_func_threshold: Callable,
        estimator: object,
-       X: Union[pd.DataFrame, np.ndarray],
-       y: Union[pd.Series, np.ndarray, None] = None,
+       X: ArrayLike,
+       y: ArrayLike = None,
+       groups: ArrayLike = None,
        scoring: Union[Callable, Dict] = None,
        cv: Union[Iterable, int, None] = None,
-       X_val: Union[pd.DataFrame, np.ndarray, None] = None,
-       y_val: Union[pd.Series, np.ndarray, None] = None,
-       X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
-       y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
+       n_jobs: int = None,
+       verbose: int = None,
+       fit_params: Dict = None,
+       pre_dispatch: int = None,
+       return_train_score: bool = False,
+       return_estimator: bool = False,
+       error_score: float = np.nan,
+       X_val: ArrayLike = None,
+       y_val: ArrayLike = None,
+       X_val_threshold: ArrayLike = None,
+       y_val_threshold: ArrayLike = None,
        cv_threshold: Union[Iterable, int, None] = None,
        threshold_set: Union[Iterable, None] = None,
        scores: Dict = None)-> Dict:
"""
|
|
|
+
|
|
|
"""
|
|
|
logger = Log("cross_validate_with_optimal_threshold:")
|
|
|
|