
wip in ml_validation

tanja committed 3 years ago · commit 9d215fc661

+ 65 - 0
cdplib/ml_validation/CVComposer.py

@@ -205,3 +205,68 @@ class CVComposer:
             self._logger.log_and_raise_error(
                     ("Failed to make sliding window cv. "
                      "Exit with error: {}".format(e)))
+            
+    def nested_expanding_cv(self,
+            test_proportion: float,
+            start_train_proportion: float,
+            step_proportion: float = None,
+            expanding_test_size: bool = False,
+            data_set_size: Union[float, None] = None,
+            index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> Iterable[Tuple[List]]:
+        """
+        For each outer fold of an expanding cv over the data set, build an
+        inner expanding cv restricted to that fold's training part.
+        """
+        logger = Log("nested_expanding_cv:")
+    
+        try:
+            cv = self.expanding_cv(test_proportion=test_proportion,
+                                   start_train_proportion=start_train_proportion,
+                                   step_proportion=step_proportion,
+                                   expanding_test_size=expanding_test_size,
+                                   data_set_size=data_set_size,
+                                   index=index)
+    
+            nested_cv = []
+    
+            for train_inds, test_inds in cv:
+    
+                fold_index = train_inds if index is not None\
+                    else None
+    
+                fold_size = len(train_inds) if index is None else None
+    
+                fold_cv = self.expanding_cv(
+                        test_proportion=test_proportion,
+                        start_train_proportion=start_train_proportion,
+                        step_proportion=step_proportion,
+                        expanding_test_size=expanding_test_size,
+                        data_set_size=fold_size,
+                        index=fold_index)
+    
+                nested_cv.append(list(fold_cv))
+    
+            return nested_cv
+    
+        except Exception as e:
+            logger.log_and_raise_error(("Failed to make nested expanding cv. "
+                                        "Exit with error: {}".format(e)))
+    
+    
+    def cv_slice_dataset(self, X, y, train_inds, test_inds)\
+            -> Tuple[Union[pd.DataFrame, np.ndarray],
+                     Union[pd.DataFrame, np.ndarray],
+                     Union[pd.Series, np.ndarray, None],
+                     Union[pd.Series, np.ndarray, None]]:
+        """
+        Slice X and y into train and validation parts
+        using the given train and test indices.
+        """
+        if isinstance(X, pd.DataFrame):
+            X_train = X.loc[train_inds]
+            X_val = X.loc[test_inds]
+        else:
+            X_train = X[train_inds]
+            X_val = X[test_inds]
+    
+        if y is not None:
+            y_train = y[train_inds]
+            y_val = y[test_inds]
+        else:
+            y_train, y_val = None, None
+
+        return X_train, X_val, y_train, y_val
+
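
For reference, a minimal usage sketch of the new CVComposer methods (illustrative only; the toy data, the no-argument constructor, and the parameter values are assumptions, not part of this commit):

    import numpy as np
    import pandas as pd

    from cdplib.ml_validation.CVComposer import CVComposer

    # toy data set of 50 samples with a default RangeIndex
    X = pd.DataFrame({"feature": range(50)})
    y = pd.Series(np.random.randint(0, 2, size=50))

    composer = CVComposer()

    # outer expanding splits, each refined by an inner expanding cv
    nested_cv = composer.nested_expanding_cv(test_proportion=0.1,
                                             start_train_proportion=0.6,
                                             step_proportion=0.1,
                                             data_set_size=50)

    for inner_cv in nested_cv:
        for train_inds, test_inds in inner_cv:
            # slice the data set into train/validation folds
            X_train, X_val, y_train, y_val = composer.cv_slice_dataset(
                    X=X, y=y, train_inds=train_inds, test_inds=test_inds)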

+ 1 - 2
cdplib/ml_validation/__init__.py

@@ -1,3 +1,2 @@
 from .cross_validate_with_fine_tuning import *
-from .CVComposer import *
-from .expanding_cv import *
+from .CVComposer import *

+ 25 - 79
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -48,84 +48,27 @@ Created on Thu Oct 29 13:58:23 2020
 
 """
 
+import sys
+
 import pandas as pd
 import numpy as np
 from itertools import zip_longest
-from typing import Union, Callable, Dict, Iterable, Tuple, List
+
+if sys.version_info >= (3, 8):
+    from typing import Callable, Dict, Iterable, Union
+else:
+    from typing_extensions import Callable, Dict, Iterable, Union
+
 from copy import deepcopy
-from itertools import accumulate, repeat, takewhile, chain
 
 from sklearn.model_selection import StratifiedKFold
 
 from cdplib.log import Log
 
+from cdplib.ml_validation.CVComposer import CVComposer
 
-# TODO: write with yield !!!!
-
-def make_nested_expanding_cv(
-        test_proportion: float,
-        start_train_proportion: float,
-        step_proportion: float = None,
-        expanding_test_size: bool = False,
-        data_set_size: Union[float, None] = None,
-        index: Union[pd.Series, np.ndarray, list, None] = None)\
-        -> Iterable[Tuple[List]]:
-    """
-    """
-    logger = Log("make_nested_expanding_cv:")
-
-    try:
-        cv = make_expanding_cv(test_proportion=test_proportion,
-                               start_train_proportion=start_train_proportion,
-                               step_proportion=step_proportion,
-                               expanding_test_size=expanding_test_size,
-                               data_set_size=data_set_size,
-                               index=index)
-
-        nested_cv = []
-
-        for train_inds, test_inds in cv:
-
-            fold_index = train_inds if index is not None\
-                else None
-
-            fold_size = len(train_inds) if index is None else None
-
-            fold_cv = make_expanding_cv(
-                    test_proportion=test_proportion,
-                    start_train_proportion=start_train_proportion,
-                    step_proportion=step_proportion,
-                    expanding_test_size=expanding_test_size,
-                    data_set_size=fold_size,
-                    index=fold_index)
-
-            nested_cv.append(list(fold_cv))
-
-        return nested_cv
-
-    except Exception as e:
-        logger.log_and_raise_error(("Failed to make nested expanding cv. "
-                                    "Exit with error: {}".format(e)))
-
-
-def cv_slice_dataset(X, y, train_inds, test_inds)\
-        -> Tuple[Union[pd.DataFrame, np.ndarray],
-                 Union[pd.Series, np.ndarray]]:
-    """
-    """
-    if isinstance(X, pd.DataFrame):
-        X_train = X.loc[train_inds]
-        X_val = X.loc[test_inds]
-    else:
-        X_train = X[train_inds]
-        X_val = X[test_inds]
-
-    if y is not None:
-        y_train = y[train_inds]
-        y_val = y[test_inds]
-
-    return X_train, X_val, y_train, y_val
 
+# TODO: write with yield !!!!
 
 def get_optimal_proba_threshold(score_func: Callable,
                                 y_true: Union[pd.Series, np.ndarray],
@@ -202,11 +145,12 @@ def cross_validate_with_optimal_threshold(
             X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
             y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
 
-            cv_threshold, X_train, y_train = make_dummy_cv(
-                    X_train=X_train,
+            cv_threshold, X_train, y_train =\
+                CVComposer().dummy_cv_and_concatenated_data_set(
+                    X_train=X_train, 
+                    X_test=X_val_threshold,
                     y_train=y_train,
-                    X_val=X_val_threshold,
-                    y_val=y_val_threshold)
+                    y_test=y_val_threshold)
         else:
 
             # if cv_threshold is given, we find the optimal threshold
@@ -230,10 +174,11 @@ def cross_validate_with_optimal_threshold(
             print("----- In cv threshold fold")
 
             X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
-                cv_slice_dataset(X=X_train,
-                                 y=y_train,
-                                 train_inds=train_inds,
-                                 test_inds=val_inds)
+                CVComposer().cv_slice_dataset(
+                    X=X_train,
+                    y=y_train,
+                    train_inds=train_inds,
+                    test_inds=val_inds)
 
             estimator.fit(X_train_fold, y_train_fold)
 
@@ -284,10 +229,11 @@ def cross_validate_with_optimal_threshold(
             print("=== In cv fold")
 
             X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
-                cv_slice_dataset(X=X_train,
-                                 y=y_train,
-                                 train_inds=train_inds,
-                                 test_inds=val_inds)
+                CVComposer().cv_slice_dataset(
+                    X=X_train,
+                    y=y_train,
+                    train_inds=train_inds,
+                    test_inds=val_inds)
 
             scores = cross_validate_with_optimal_threshold(
                     estimator=estimator,
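
Illustratively, the refactored threshold step is expected to be used roughly as below. This is a sketch only: it assumes dummy_cv_and_concatenated_data_set returns a single-split cv over the concatenated train/validation data (as the replaced make_dummy_cv did), and the toy data is not part of this commit.

    import numpy as np
    import pandas as pd

    from cdplib.ml_validation.CVComposer import CVComposer

    # toy train/validation split with non-overlapping index labels
    X_train = pd.DataFrame({"feature": range(40)})
    y_train = pd.Series(np.random.randint(0, 2, size=40))
    X_val = pd.DataFrame({"feature": range(40, 50)}, index=range(40, 50))
    y_val = pd.Series(np.random.randint(0, 2, size=10), index=range(40, 50))

    composer = CVComposer()

    # one dummy split over the concatenated data, used to tune the threshold
    cv_threshold, X_all, y_all = composer.dummy_cv_and_concatenated_data_set(
            X_train=X_train,
            X_test=X_val,
            y_train=y_train,
            y_test=y_val)

    for train_inds, val_inds in cv_threshold:
        X_tr, X_va, y_tr, y_va = composer.cv_slice_dataset(
                X=X_all, y=y_all, train_inds=train_inds, test_inds=val_inds)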

+ 0 - 97
cdplib/ml_validation/expanding_cv.py

@@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Dec  9 09:55:52 2020
-
-@author: tanya
-"""
-
-from typing import Union, Iterable, Tuple, List
-import pandas as pd
-import numpy as np
-from itertools import accumulate, repeat, takewhile
-
-from cdplib.log import Log
-
-
-def make_expanding_cv(test_proportion: float,
-                      start_train_proportion: float,
-                      step_proportion: float = None,
-                      expanding_test_size: bool = False,
-                      data_set_size: Union[float, None] = None,
-                      index: Union[pd.Series, np.ndarray, list, None] = None)\
-        -> Union[Iterable[Tuple[List]], None]:
-    """
-
-    """
-    logger = Log("make_expanding_cv:")
-
-    try:
-        assert((index is None) != (data_set_size is None)),\
-            "Set index or data_set_size"
-
-        index = index if (index is not None)\
-            else pd.Series(range(data_set_size))
-
-        data_set_size = data_set_size or len(index)
-
-        start_train_size = int(start_train_proportion * data_set_size)
-        step_size = int(step_proportion * data_set_size)
-
-        test_size = int(test_proportion * data_set_size)
-
-        train_inds_set = (list(range(train_size))
-                          for train_size in
-                          takewhile(
-                                  lambda x: x <= data_set_size - test_size,
-                                  accumulate(repeat(start_train_size),
-                                             lambda x, _: x + step_size)))
-
-        for train_inds in train_inds_set:
-
-            if expanding_test_size:
-
-                yield (index[train_inds],
-                       index[train_inds[-1] + 1:
-                             train_inds[-1] + 1
-                             + int(test_proportion*len(train_inds))])
-
-            else:
-
-                yield (index[train_inds],
-                       index[train_inds[-1] + 1:
-                             train_inds[-1] + 1 + test_size])
-
-    except Exception as e:
-        logger.log_and_raise_error(("Failed to make expanding cv. "
-                                    "Exit with error: {}".format(e)))
-
-
-if __name__ == "__main__":
-
-    logger = Log("Test_expanding_cv: ")
-
-    logger.info("Start Testing")
-
-    logger.info("Testing expanding cv: ")
-
-    cv = make_expanding_cv(data_set_size=50,
-                           test_proportion=0.1,
-                           start_train_proportion=0.6,
-                           step_proportion=0.1,
-                           expanding_test_size=True)
-
-    cv = list(cv)
-
-    logger.info("Testing expanding cv with datetime index")
-
-    cv = make_expanding_cv(
-            test_proportion=0.1,
-            start_train_proportion=0.6,
-            step_proportion=0.1,
-            index=pd.date_range(start=pd.to_datetime("2020-01-01"),
-                                periods=50))
-
-    cv = list(cv)
-
-    logger.info("Finish testing")
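
The deleted module's smoke test can still be run against the class-based API; a minimal sketch (assuming CVComposer().expanding_cv keeps the signature of the removed make_expanding_cv):

    import pandas as pd

    from cdplib.log import Log
    from cdplib.ml_validation.CVComposer import CVComposer

    logger = Log("Test_expanding_cv: ")
    logger.info("Start Testing")

    composer = CVComposer()

    # expanding cv over an integer index of 50 samples
    cv = list(composer.expanding_cv(data_set_size=50,
                                    test_proportion=0.1,
                                    start_train_proportion=0.6,
                                    step_proportion=0.1,
                                    expanding_test_size=True))

    # expanding cv over a datetime index
    cv = list(composer.expanding_cv(
            test_proportion=0.1,
            start_train_proportion=0.6,
            step_proportion=0.1,
            index=pd.date_range(start=pd.to_datetime("2020-01-01"),
                                periods=50)))

    logger.info("Finish testing")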