2 커밋 60abdd6dc6 ... 8edcdaca41

작성자 SHA1 메시지 날짜
  tanja 8edcdaca41 trying to solve merge conflicts 3 년 전
  tanja b5407894a3 added comments to validation with fine tuning 3 년 전

+ 0 - 173
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -1,173 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Apr 23 08:51:53 2020
-
-@author: tanya
-
-@description: class for fine-tuning a sklearn classifier
-(optimizing the probability threshold)
-"""
-
-import pandas as pd
-import numpy as np
-
-from typing import Callable
-
-from sklearn.base import (BaseEstimator, ClassifierMixin,
-                          clone, MetaEstimatorMixin)
-
-from cdplib.log import Log
-
-from cdplib.utils.TyperConverter import TypeConverter
-
-
-class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
-                            MetaEstimatorMixin):
-    """
-    Probability threshold tuning for a given estimator.
-    Overrides the method predict of the given sklearn classifer
-    and returns predictions with the optimal value of
-    the probability threshold.
-
-    An object of this class can be passed to an sklearn Pipeline
-    """
-    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
-                 cv=None, threshold_step: float = 0.1):
-        """
-        """
-        self.estimator = estimator
-
-        self.is_fitted = False
-
-        self.greater_is_better = greater_is_better
-
-        if cv is None:
-            self.cv = ...
-        else:
-            self.cv = cv
-
-        self.cost_func = cost_func
-
-        self.threshold_step = threshold_step
-
-        self.optimal_threshold = 0.5
-
-        self._logger = Log("FineTunedClassifyCV")
-
-    def _get_best_threshold(self, y_val: (pd.DataFrame, np.array),
-                            proba_pred: (pd.DataFrame, np.array)):
-        '''
-        '''
-        costs = {}
-
-        for t in np.arange(self.threshold_step, 1, self.threshold_step):
-            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))
-
-        if self.greater_is_better:
-            return max(costs, key=costs.get)
-        else:
-            return min(costs, key=costs.get)
-
-    def fit(self, X: (pd.DataFrame, np.array),
-            y: (pd.DataFrame, np.array) = None,
-            **fit_args):
-        """
-        """
-        X = TypeConverter().convert_to_ndarray(X)
-        if y is not None:
-            y = TypeConverter().convert_to_ndarray(X)
-
-        optimal_thrs_per_fold = []
-
-        for train_inds, val_inds in self.cv:
-            X_train, X_val = X[train_inds], X[val_inds]
-
-            if y is not None:
-                y_train, y_val = y[train_inds], y[val_inds]
-            else:
-                y_train, y_val = None, None
-
-            estimator = clone(fine_tuned_clf.estimator)
-
-            estimator.fit(X_train, y_train, **fit_args)
-
-            proba_pred = estimator.predict_proba(X_val)
-
-            optimal_thr = self._get_best_threshold(y_val, proba_pred)
-
-            optimal_thrs_per_fold.append(optimal_thr)
-
-        self.optimal_threshold = np.mean(optimal_thrs_per_fold)
-
-        self.estimator.fit(X, **fit_args)
-
-    def predict(self, X: (pd.DataFrame, np.array)) -> np.array:
-        """
-        """
-        if self.is_fitted:
-
-            proba_pred = self.estimator.predict_proba(X)
-
-            return (proba_pred >= self.optimal_threshold).astype(int)
-
-        else:
-            self._logger.warn("You should fit first")
-
-    def get_params(self):
-        """
-        """
-        params = self.estimator.get_params()
-
-        params.update({"cv": self.cv, "cost_func": self.cost_func})
-
-        return params
-
-    def set_params(self, **params: dict):
-        """
-        """
-        for param in params:
-            if param == "cv":
-                self.cv = params[param]
-                params.pop(param)
-
-            elif param == "cost_func":
-                self.cost_func = params[param]
-                params.pop(param)
-
-        self.estimator.set_params(**params)
-
-
-if __name__ == "__main__":
-    # test
-    from sklearn.datasets import load_iris
-    from sklearn.metrics import accuracy_score
-    import gc
-    from xgboost import XGBRFClassifier
-
-    data = load_iris()
-    X, y = data["data"], data["target"]
-    y = (y==1).astype(int)
-    del data
-    gc.collect()
-
-    # make a custom cv object
-    val_len = len(X)//10
-    split_inds = range(len(X)//2, len(X), val_len)
-
-    cv = []
-
-    for i in split_inds:
-        train_inds = list(range(i))
-        val_inds = list(range(i, i + val_len))
-        cv.append((train_inds, val_inds))
-
-    clf = XGBRFClassifier()
-
-    fine_tuned_clf = FineTunedClassifierCV(estimator=clf,
-                                           cv=cv,
-                                           greater_is_better=True,
-                                           cost_func=accuracy_score)
-
-    fine_tuned_clf.fit(X=X, y=y)
-

+ 0 - 1
cdplib/fine_tuning/__init__.py

@@ -1 +0,0 @@
-from .FineTunedClassiferCV import *

+ 166 - 63
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -8,52 +8,163 @@ Created on Thu Oct 29 13:58:23 2020
 
 @description:
 
-* Input:
-    - pipeline/hyperparameter space
-    - data_train
-    - cv
-    - cv_folds
-
-* For each pipeline:
-
-    -> Split data_train into folds according to cv
-
-     -> For each fold:
-
-         => get data_train_fold, data_test_fold, cv_fold
-
-         => split data_train_fold into subfolds according to cv_fold
-
-         => For each subfold:
-
-             ==> get data_train_subfold, data_test_subfold
-
-             ==> train pipeline on data_train_subfold
-
-             ==> find best_threshold_subfold on data_test_subfold
-
-        => Find averaged_threshold_fold averaged over best_threshold_subfold
-
-        => train pipeline on data_train_fold
-
-        => find score_fold on data_test_fold with proba_threshold_fold
-
-        => find best_threshold_fold on data_test_fold
-
-    -> find score averaged over score_fold
-
-    -> find averaged_threshold averaged over best_threshold_fold
-
-* choose (pipeline/hyperparameters, threshold) in the space with best score
+scenario 1:
+    
+    You have a train set and a validation set and you tune the probability
+    threshold on the validation set:
+        
+        X = X_train,
+        y = y_train,
+        X_val = X_val,
+        y_val = y_val
+        
+        and you set to None:
+            
+        cv = None,
+        X_val_threshold = None,
+        y_val_threshold = None,
+        cv_threshold = None
+        
+    Downsides:
+        
+    1) You return a single validation score
+        (cross validation score would be more robust)
+        
+    2) You fine tune on the same validation set as you calculate
+    the score on. Using an independent validation data set for fine tuning would be
+    more robust
+    
+    3) You fine tune the probability threshold on a single dataset.
+    It would be more robust to tune on several independent datasets
+    and take the average probability threshold.
+        
+scenario 2:
+    
+    You have a train set and a validation set and you tune the probability
+    threshold on an independent set. You need to pass the independent data set
+    to the X_val_threshold and y_val_threshold parameters.
+        
+        X = X_train,
+        y = y_train,
+        X_val = X_val,
+        y_val = y_val,
+        X_val_threshold = X_val_independent,
+        y_val_threshold = y_val_independent
+        
+        and you set to None:
+            
+        cv = None,
+        cv_threshold = None
+        
+    Downsides:
+        
+    1) You return a single validation score
+        (cross validation score would be more robust)
+
+    
+    2) You fine tune the probability threshold on a single dataset.
+    It would be more robust to tune on several independent datasets
+    and take the average probability threshold.
+    
+    
+scenario 3:
+    
+    You have a dataset on which you want to calculate the cross-validation
+    score and a cv object. You fine tune the probability threshold on each fold,
+    using the validation part of the fold.
+        
+        X = X_train,
+        y = y_train,
+        cv = cv
+        
+        and you set to None:
+            
+        X_val = None,
+        y_val = None,
+        X_val_threshold = None,
+        y_val_threshold = None,
+        cv_threshold = None
+        
+    Downsides:
+        
+    2) In each fold, you fine tune on the same validation set as you calculate
+    the score on. Using an independent validation data set for fine tuning would be
+    more robust
+    
+    3) In each fold, you fine tune the probability threshold on a single dataset.
+    It would be more robust to tune on several independent datasets
+    and take the average probability threshold.
+    
+    
+scenario 4:
+    
+    You have a dataset on which you want to calculate the cross-validation
+    score and a cv object. You fine tune the probability threshold on an
+    independent dataset (or multiple datasets) in each fold.
+    
+    You need to have a cv_threshold object that tells you how to
+    split each of the folds of your cv.
+    
+    Example 1:
+        
+    cv = [((1, 2, 3, 4), (5, 6, 7)),
+          ((5, 6, 7, 8), (9, 10))]
+    
+    cv_threshold = [ [(1,2), (3, 4)],
+                     [(5, 6), (7, 8)]  
+                   ]
+    
+    Example 2:
+        
+    cv = 3
+    cv_threshold = [4, 4, 4]
+    
+    
+    Example 3:
+        cv = [((1, 2, 3, 4, 5, 6), (7, 8, 9)),
+              ((5, 6, 7, 8), (9, 10))]
+    
+    cv_threshold = [ [((1, 2), (3, 4, 5)),
+                      ((2, 3), (4, 5, 6))
+                     ]
+                   ]
+        
+    #####################
+    
+        X = X_train,
+        y = y_train,
+        cv = cv,
+        cv_threshold = cv_threshold
+        
+        and you set to None:
+            
+        X_val = None,
+        y_val = None,
+        X_val_threshold = None,
+        y_val_threshold = None
+        
+    Downsides:
+        
+    2) In each fold, you fine tune on the same validation set as you calculate
+    the score on. Using an independent validation data set for fine tuning would be
+    more robust
+    
+    3) In each fold, you fine tune the probability threshold on a single dataset.
+    It would be more robust to tune on several independent datasets
+    and take the average probability threshold.
+    
+        
+        
 
 """
 
 import sys
 
-import pandas as pd
 import numpy as np
 from itertools import zip_longest
 
+from numpy.typing import ArrayLike
+
 if sys.version_info >= (3, 8):
     from typing import Callable, Dict, Iterable, Union
 else:
@@ -67,44 +178,36 @@ from cdplib.log import Log
 
 from cdplib.ml_validation.CVComposer import CVComposer
 
+from cdplib.fine_tuning import get_optimal_proba_threshold
 
-# TODO: write with yield !!!!
 
-def get_optimal_proba_threshold(score_func: Callable,
-                                y_true: Union[pd.Series, np.ndarray],
-                                proba: Union[pd.Series, np.ndarray],
-                                threshold_set: Union[Iterable, None] = None):
-    """
-    """
-    scores = {}
-
-    if threshold_set is None:
-        threshold_set = np.arange(0, 1, 0.1)
-
-    for threshold in threshold_set:
-
-        y_pred = (proba >= threshold).astype(int)
-
-        scores[threshold] = score_func(y_true, y_pred)
-
-    return max(scores, key=scores.get)
+# TODO: write with yield !!!!
 
 
 def cross_validate_with_optimal_threshold(
         score_func_threshold: Callable,
         estimator: object,
-        X: Union[pd.DataFrame, np.ndarray],
-        y: Union[pd.Series, np.ndarray, None] = None,
+        X: ArrayLike,
+        y: ArrayLike = None,
+        groups: ArrayLike = None,
         scoring: Union[Callable, Dict] = None,
         cv: Union[Iterable, int, None] = None,
-        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
-        y_val: Union[pd.Series, np.ndarray, None] = None,
-        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
-        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
+        n_jobs: int = None,
+        verbose: int = None,
+        fit_params: Dict = None,
+        pre_dispatch: int = None,
+        return_train_score: bool = False,
+        return_estimator: bool = False,
+        error_score: float = np.nan, 
+        X_val: ArrayLike = None,
+        y_val: ArrayLike = None,
+        X_val_threshold: ArrayLike = None,
+        y_val_threshold: ArrayLike = None,
         cv_threshold: Union[Iterable, int, None] = None,
         threshold_set: Union[Iterable, None] = None,
         scores: Dict = None)-> Dict:
     """
+    
     """
     logger = Log("cross_validate_with_optimal_threshold:")
 

+ 38 - 0
cdplib/ml_validation/fine_tuning.py

@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Sat May  1 13:46:42 2021
+
+@author: tanya
+"""
+
+import sys
+
+import numpy as np
+
+from numpy.typing import ArrayLike
+
+if sys.version_info >= (3, 8):
+    from typing import Iterable, Callable
+else:
+    from typing_extensions import Iterable, Callable
+
+
+def get_optimal_proba_threshold(score_func: Callable,
+                                y_true: ArrayLike,
+                                proba: ArrayLike,
+                                threshold_set: Iterable = None):
+    """
+    """
+    scores = {}
+
+    if threshold_set is None:
+        threshold_set = np.arange(0, 1, 0.1)
+
+    for threshold in threshold_set:
+
+        y_pred = (proba >= threshold).astype(int)
+
+        scores[threshold] = score_func(y_true, y_pred)
+
+    return max(scores, key=scores.get)