
added comments to validation with fine tuning

tanja 3 years ago
parent
commit
b5407894a3

+ 0 - 173
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Apr 23 08:51:53 2020
-
-@author: tanya
-
-@description: class for fine-tuning a sklearn classifier
-(optimizing the probability threshold)
-"""
-
-import pandas as pd
-import numpy as np
-
-from typing import Callable, Union
-
-from sklearn.base import (BaseEstimator, ClassifierMixin,
-                          clone, MetaEstimatorMixin)
-
-# used only as a fallback split when no cv is passed (see fit)
-from sklearn.model_selection import KFold
-
-from cdplib.log import Log
-
-from cdplib.utils.TyperConverter import TypeConverter
-
-
-class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
-                            MetaEstimatorMixin):
-    """
-    Probability threshold tuning for a given estimator.
-    Overrides the predict method of the given sklearn classifier
-    and returns predictions made with the optimal value of
-    the probability threshold.
-
-    An object of this class can be passed to an sklearn Pipeline
-    """
-    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
-                 cv=None, threshold_step: float = 0.1):
-        """
-        """
-        self.estimator = estimator
-
-        self.is_fitted = False
-
-        self.greater_is_better = greater_is_better
-
-        # NOTE: the original left the default cv branch unfinished; if cv is
-        # None, a plain 5-fold split is generated in fit as a fallback
-        self.cv = cv
-
-        self.cost_func = cost_func
-
-        self.threshold_step = threshold_step
-
-        self.optimal_threshold = 0.5
-
-        self._logger = Log("FineTunedClassifierCV")
-
-    def _get_best_threshold(self, y_val: Union[pd.DataFrame, np.ndarray],
-                            proba_pred: Union[pd.DataFrame, np.ndarray]):
-        '''
-        '''
-        costs = {}
-
-        for t in np.arange(self.threshold_step, 1, self.threshold_step):
-            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))
-
-        if self.greater_is_better:
-            return max(costs, key=costs.get)
-        else:
-            return min(costs, key=costs.get)
-
-    def fit(self, X: Union[pd.DataFrame, np.ndarray],
-            y: Union[pd.DataFrame, np.ndarray, None] = None,
-            **fit_args):
-        """
-        """
-        X = TypeConverter().convert_to_ndarray(X)
-        if y is not None:
-            y = TypeConverter().convert_to_ndarray(y)
-
-        optimal_thrs_per_fold = []
-
-        # assumption: fall back to a plain 5-fold split if no cv was passed
-        cv = self.cv if self.cv is not None else KFold(n_splits=5).split(X)
-
-        for train_inds, val_inds in cv:
-            X_train, X_val = X[train_inds], X[val_inds]
-
-            if y is not None:
-                y_train, y_val = y[train_inds], y[val_inds]
-            else:
-                y_train, y_val = None, None
-
-            estimator = clone(self.estimator)
-
-            estimator.fit(X_train, y_train, **fit_args)
-
-            # probability of the positive class
-            proba_pred = estimator.predict_proba(X_val)[:, 1]
-
-            optimal_thr = self._get_best_threshold(y_val, proba_pred)
-
-            optimal_thrs_per_fold.append(optimal_thr)
-
-        self.optimal_threshold = np.mean(optimal_thrs_per_fold)
-
-        self.estimator.fit(X, y, **fit_args)
-
-        self.is_fitted = True
-
-        return self
-
-    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
-        """
-        """
-        if self.is_fitted:
-
-            # probability of the positive class
-            proba_pred = self.estimator.predict_proba(X)[:, 1]
-
-            return (proba_pred >= self.optimal_threshold).astype(int)
-
-        else:
-            self._logger.warn("You should fit first")
-
-    def get_params(self, deep: bool = True):
-        """
-        """
-        params = self.estimator.get_params(deep=deep)
-
-        params.update({"cv": self.cv, "cost_func": self.cost_func})
-
-        return params
-
-    def set_params(self, **params: dict):
-        """
-        """
-        if "cv" in params:
-            self.cv = params.pop("cv")
-
-        if "cost_func" in params:
-            self.cost_func = params.pop("cost_func")
-
-        self.estimator.set_params(**params)
-
-        return self
-
-
-if __name__ == "__main__":
-    # test
-    from sklearn.datasets import load_iris
-    from sklearn.metrics import accuracy_score
-    import gc
-    from xgboost import XGBRFClassifier
-
-    data = load_iris()
-    X, y = data["data"], data["target"]
-    y = (y==1).astype(int)
-    del data
-    gc.collect()
-
-    # make a custom cv object
-    val_len = len(X)//10
-    split_inds = range(len(X)//2, len(X), val_len)
-
-    cv = []
-
-    for i in split_inds:
-        train_inds = list(range(i))
-        val_inds = list(range(i, i + val_len))
-        cv.append((train_inds, val_inds))
-
-    clf = XGBRFClassifier()
-
-    fine_tuned_clf = FineTunedClassifierCV(estimator=clf,
-                                           cv=cv,
-                                           greater_is_better=True,
-                                           cost_func=accuracy_score)
-
-    fine_tuned_clf.fit(X=X, y=y)
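-
-    # Illustrative sketch (not part of the original test, names reused from
-    # above): predict with the tuned threshold and report the train accuracy.
-    y_pred = fine_tuned_clf.predict(X)
-    print("optimal threshold:", fine_tuned_clf.optimal_threshold)
-    print("train accuracy:", accuracy_score(y, y_pred))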
-

+ 0 - 1
cdplib/fine_tuning/__init__.py

@@ -1 +0,0 @@
-from .FineTunedClassiferCV import *

+ 166 - 63
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -8,52 +8,163 @@ Created on Thu Oct 29 13:58:23 2020
 
 @description:
 
-* Input:
-    - pipeline/hyperparameter space
-    - data_train
-    - cv
-    - cv_folds
-
-* For each pipeline:
-
-    -> Split data_train into folds according to cv
-
-     -> For each fold:
-
-         => get data_train_fold, data_test_fold, cv_fold
-
-         => split data_train_fold into subfolds according to cv_fold
-
-         => For each subfold:
-
-             ==> get data_train_subfold, data_test_subfold
-
-             ==> train pipeline on data_train_subfold
-
-             ==> find best_threshold_subfold on data_test_subfold
-
-        => Find averaged_threshold_fold averaged over best_threshold_subfold
-
-        => train pipeline on data_train_fold
-
-        => find score_fold on data_test_fold with proba_threshold_fold
-
-        => find best_threshold_fold on data_test_fold
-
-    -> find score averaged over score_fold
-
-    -> find averaged_threshold averaged over best_threshold_fold
-
-* choose (pipeline/hyperparameters, threshold) in the space with best score
+scenario 1:
+    
+    You have a train set and a validation set and you tune the probability
+    threshold on the validation set:
+        
+        X = X_train,
+        y = y_train,
+        X_val = X_val,
+        y_val = y_val
+        
+        and you set to None:
+            
+        cv = None,
+        X_val_threshold = None,
+        y_val_threshold = None,
+        cv_threshold = None
+        
+    Downsides:
+
+    1) You get only a single validation score
+       (a cross-validation score would be more robust).
+
+    2) You fine tune on the same validation set that you calculate
+       the score on. Using an independent validation data set for
+       fine tuning would be more robust.
+
+    3) You fine tune the probability threshold on a single dataset.
+       It would be more robust to tune on several independent datasets
+       and take the average probability threshold.
+        
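+    As a minimal sketch (X_train, y_train, X_val, y_val and pipeline are
+    hypothetical placeholders, f1_score is just an example metric), the call
+    for this scenario might look like:
+
+        from sklearn.metrics import f1_score
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=pipeline,
+            X=X_train, y=y_train,
+            X_val=X_val, y_val=y_val)
+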
+scenario 2:
+    
+    You have a train set and a validation set, and you tune the probability
+    threshold on an independent set. You need to pass the independent data
+    set via the X_val_threshold and y_val_threshold parameters:
+        
+        X = X_train,
+        y = y_train,
+        X_val = X_val,
+        y_val = y_val,
+        X_val_threshold = X_val_independent,
+        y_val_threshold = y_val_independent
+        
+        and you set to None:
+            
+        cv = None,
+        cv_threshold = None
+        
+    Downsides:
+
+    1) You get only a single validation score
+       (a cross-validation score would be more robust).
+
+    2) You fine tune the probability threshold on a single dataset.
+       It would be more robust to tune on several independent datasets
+       and take the average probability threshold.
+    
+    
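+    A sketch of the call for this scenario (X_val_independent and
+    y_val_independent are hypothetical placeholders for the extra hold-out
+    set that is used only for threshold tuning):
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=pipeline,
+            X=X_train, y=y_train,
+            X_val=X_val, y_val=y_val,
+            X_val_threshold=X_val_independent,
+            y_val_threshold=y_val_independent)
+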
+scenario 3:
+    
+    You have a dataset on which you want to calculate the cross-validation
+    score and a cv object. You fine tune the probability threshold on each fold,
+    using the validation part of the fold.
+        
+        X = X_train,
+        y = y_train,
+        cv = cv
+        
+        and you set to None:
+            
+        X_val = None,
+        y_val = None,
+        X_val_threshold = None,
+        y_val_threshold = None,
+        cv_threshold = None
+        
+    Downsides:
+
+    1) In each fold, you fine tune on the same validation set that you
+       calculate the score on. Using an independent validation data set
+       for fine tuning would be more robust.
+
+    2) In each fold, you fine tune the probability threshold on a single
+       dataset. It would be more robust to tune on several independent
+       datasets and take the average probability threshold.
+    
+    
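+    A sketch of the call for this scenario, assuming a plain 5-fold split
+    (cv may also be an iterable of (train_indices, test_indices) pairs):
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=pipeline,
+            X=X_train, y=y_train,
+            cv=5)
+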
+scenario 4:
+    
+    You have a dataset on which you want to calculate the cross-validation
+    score and a cv object. In each fold, you fine tune the probability
+    threshold on an independent dataset (or on several independent datasets).
+
+    You need a cv_threshold object that tells how to split each of the
+    folds of your cv.
+    
+    Example 1:
+        
+    cv = [((1, 2, 3, 4), (5, 6, 7)),
+          ((5, 6, 7, 8), (9, 10))]
+    
+    cv_threshold = [ [(1,2), (3, 4)],
+                     [(5, 6), (7, 8)]  
+                   ]
+    
+    Example 2:
+        
+    cv = 3
+    cv_threshold = [4, 4, 4]
+    
+    
+    Example 3:
+        cv = [((1, 2, 3, 4, 5, 6), (7, 8, 9)),
+              ((5, 6, 7, 8), (9, 10))]
+    
+    cv_threshold = [ [((1, 2), (3, 4, 5)),
+                      ((2, 3), (4, 5, 6))
+                     ]
+                   ]
+        
+    #####################
+    
+        X = X_train,
+        y = y_train,
+        cv = cv,
+        cv_threshold = cv_threshold
+        
+        and you set to None:
+            
+        X_val = None,
+        y_val = None,
+        X_val_threshold = None,
+        y_val_threshold = None
+        
+    Downsides:
+
+    1) This is the most computationally expensive scenario: in each cv fold
+       the pipeline is additionally re-fitted on every cv_threshold subfold
+       before the threshold is averaged over the subfolds.
+    
+        
+        
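+    A sketch of the call for this scenario, reusing the cv and cv_threshold
+    of Example 1 above (the remaining names are hypothetical placeholders):
+
+        cv = [((1, 2, 3, 4), (5, 6, 7)),
+              ((5, 6, 7, 8), (9, 10))]
+
+        cv_threshold = [[(1, 2), (3, 4)],
+                        [(5, 6), (7, 8)]]
+
+        result = cross_validate_with_optimal_threshold(
+            score_func_threshold=f1_score,
+            estimator=pipeline,
+            X=X_train, y=y_train,
+            cv=cv,
+            cv_threshold=cv_threshold)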
 
 """
 
 import sys
 
-import pandas as pd
 import numpy as np
 from itertools import zip_longest
 
+from numpy.typing import ArrayLike
+
 if sys.version_info >= (3, 8):
     from typing import Callable, Dict, Iterable, Union
 else:
@@ -67,44 +178,36 @@ from cdplib.log import Log
 
 from cdplib.ml_validation.CVComposer import CVComposer
 
+from cdplib.ml_validation.fine_tuning import get_optimal_proba_threshold
 
-# TODO: write with yield !!!!
 
-def get_optimal_proba_threshold(score_func: Callable,
-                                y_true: Union[pd.Series, np.ndarray],
-                                proba: Union[pd.Series, np.ndarray],
-                                threshold_set: Union[Iterable, None] = None):
-    """
-    """
-    scores = {}
-
-    if threshold_set is None:
-        threshold_set = np.arange(0, 1, 0.1)
-
-    for threshold in threshold_set:
-
-        y_pred = (proba >= threshold).astype(int)
-
-        scores[threshold] = score_func(y_true, y_pred)
-
-    return max(scores, key=scores.get)
+# TODO: write with yield !!!!
 
 
 def cross_validate_with_optimal_threshold(
         score_func_threshold: Callable,
         estimator: object,
-        X: Union[pd.DataFrame, np.ndarray],
-        y: Union[pd.Series, np.ndarray, None] = None,
+        X: ArrayLike,
+        y: ArrayLike = None,
+        groups: ArrayLike = None,
         scoring: Union[Callable, Dict] = None,
         cv: Union[Iterable, int, None] = None,
-        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
-        y_val: Union[pd.Series, np.ndarray, None] = None,
-        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
-        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
+        n_jobs: int = None,
+        verbose: int = None,
+        fit_params: Dict = None,
+        pre_dispatch: int = None,
+        return_train_score: bool = False,
+        return_estimator: bool = False,
+        error_score: float = np.nan, 
+        X_val: ArrayLike = None,
+        y_val: ArrayLike = None,
+        X_val_threshold: ArrayLike = None,
+        y_val_threshold: ArrayLike = None,
         cv_threshold: Union[Iterable, int, None] = None,
         threshold_set: Union[Iterable, None] = None,
         scores: Dict = None)-> Dict:
     """
+    Computes validation scores for the estimator together with an optimal
+    probability threshold for score_func_threshold, according to one of the
+    four scenarios described in the module docstring above.
     """
     logger = Log("cross_validate_with_optimal_threshold:")
 

+ 38 - 0
cdplib/ml_validation/fine_tuning.py

@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat May  1 13:46:42 2021
+
+@author: tanya
+"""
+
+import sys
+
+import numpy as np
+
+from numpy.typing import ArrayLike
+
+if sys.version_info >= (3, 8):
+    from typing import Iterable, Callable
+else:
+    from typing_extensions import Iterable, Callable
+
+
+def get_optimal_proba_threshold(score_func: Callable,
+                                y_true: ArrayLike,
+                                proba: ArrayLike,
+                                threshold_set: Iterable = None):
+    """
+    """
+    scores = {}
+
+    if threshold_set is None:
+        threshold_set = np.arange(0, 1, 0.1)
+
+    for threshold in threshold_set:
+
+        y_pred = (proba >= threshold).astype(int)
+
+        scores[threshold] = score_func(y_true, y_pred)
+
+    return max(scores, key=scores.get)
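+
+
+if __name__ == "__main__":
+    # Illustrative sketch only: tune a threshold with f1_score as the score
+    # function; the labels and probabilities below are randomly generated.
+    from sklearn.metrics import f1_score
+
+    rng = np.random.RandomState(0)
+    y_true = rng.randint(0, 2, size=100)
+    proba = rng.uniform(size=100)
+
+    threshold = get_optimal_proba_threshold(score_func=f1_score,
+                                            y_true=y_true,
+                                            proba=proba)
+
+    print("optimal threshold:", threshold)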