
wip in ml_validation

tanja committed 3 years ago · commit 9d215fc661

+ 65 - 0
cdplib/ml_validation/CVComposer.py

@@ -205,3 +205,68 @@ class CVComposer:
             self._logger.log_and_raise_error(
                     ("Failed to make sliding window cv. "
                      "Exit with error: {}".format(e)))
+            
+    def nested_expanding_cv(self,
+            test_proportion: float,
+            start_train_proportion: float,
+            step_proportion: float = None,
+            expanding_test_size: bool = False,
+            data_set_size: Union[float, None] = None,
+            index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> Iterable[Tuple[List]]:
+        """
+        For each outer fold of an expanding cv over the data set, build an
+        inner expanding cv restricted to that fold's training part.
+        """
+        logger = Log("nested_expanding_cv:")
+    
+        try:
+            cv = self.expanding_cv(test_proportion=test_proportion,
+                                   start_train_proportion=start_train_proportion,
+                                   step_proportion=step_proportion,
+                                   expanding_test_size=expanding_test_size,
+                                   data_set_size=data_set_size,
+                                   index=index)
+    
+            nested_cv = []
+    
+            for train_inds, test_inds in cv:
+    
+                fold_index = train_inds if index is not None\
+                    else None
+    
+                fold_size = len(train_inds) if index is None else None
+    
+                fold_cv = self.expanding_cv(
+                        test_proportion=test_proportion,
+                        start_train_proportion=start_train_proportion,
+                        step_proportion=step_proportion,
+                        expanding_test_size=expanding_test_size,
+                        data_set_size=fold_size,
+                        index=fold_index)
+    
+                nested_cv.append(list(fold_cv))
+    
+            return nested_cv
+    
+        except Exception as e:
+            logger.log_and_raise_error(("Failed to make nested expanding cv. "
+                                        "Exit with error: {}".format(e)))
+    
+    
+    def cv_slice_dataset(self, X, y, train_inds, test_inds)\
+            -> Tuple[Union[pd.DataFrame, np.ndarray],
+                     Union[pd.DataFrame, np.ndarray],
+                     Union[pd.Series, np.ndarray, None],
+                     Union[pd.Series, np.ndarray, None]]:
+        """
+        Slice X and y into train and validation parts
+        using the given train and test indices.
+        """
+        if isinstance(X, pd.DataFrame):
+            X_train = X.loc[train_inds]
+            X_val = X.loc[test_inds]
+        else:
+            X_train = X[train_inds]
+            X_val = X[test_inds]
+    
+        if y is not None:
+            y_train = y[train_inds]
+            y_val = y[test_inds]
+        else:
+            y_train, y_val = None, None
+
+        return X_train, X_val, y_train, y_val
+
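
For reference, a minimal usage sketch of the new CVComposer methods (illustrative only; the toy data, the no-argument constructor, and the parameter values are assumptions, not part of this commit):

    import numpy as np
    import pandas as pd

    from cdplib.ml_validation.CVComposer import CVComposer

    # toy data set of 50 samples with a default RangeIndex
    X = pd.DataFrame({"feature": range(50)})
    y = pd.Series(np.random.randint(0, 2, size=50))

    composer = CVComposer()

    # outer expanding splits, each refined by an inner expanding cv
    nested_cv = composer.nested_expanding_cv(test_proportion=0.1,
                                             start_train_proportion=0.6,
                                             step_proportion=0.1,
                                             data_set_size=50)

    for inner_cv in nested_cv:
        for train_inds, test_inds in inner_cv:
            # slice the data set into train/validation folds
            X_train, X_val, y_train, y_val = composer.cv_slice_dataset(
                    X=X, y=y, train_inds=train_inds, test_inds=test_inds)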

+ 1 - 2
cdplib/ml_validation/__init__.py

@@ -1,3 +1,2 @@
 from .cross_validate_with_fine_tuning import *
-from .CVComposer import *
-from .expanding_cv import *
+from .CVComposer import *

+ 25 - 79
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -48,84 +48,27 @@ Created on Thu Oct 29 13:58:23 2020
 
 """
 
+import sys
+
 import pandas as pd
 import numpy as np
 from itertools import zip_longest
-from typing import Union, Callable, Dict, Iterable, Tuple, List
+
+if sys.version_info >= (3, 8):
+    from typing import Callable, Dict, Iterable, Union
+else:
+    from typing_extensions import Callable, Dict, Iterable, Union
+
 from copy import deepcopy
-from itertools import accumulate, repeat, takewhile, chain
 
 from sklearn.model_selection import StratifiedKFold
 
 from cdplib.log import Log
 
+from cdplib.ml_validation.CVComposer import CVComposer
 
-# TODO: write with yield !!!!
-
-def make_nested_expanding_cv(
-        test_proportion: float,
-        start_train_proportion: float,
-        step_proportion: float = None,
-        expanding_test_size: bool = False,
-        data_set_size: Union[float, None] = None,
-        index: Union[pd.Series, np.ndarray, list, None] = None)\
-        -> Iterable[Tuple[List]]:
-    """
-    """
-    logger = Log("make_nested_expanding_cv:")
-
-    try:
-        cv = make_expanding_cv(test_proportion=test_proportion,
-                               start_train_proportion=start_train_proportion,
-                               step_proportion=step_proportion,
-                               expanding_test_size=expanding_test_size,
-                               data_set_size=data_set_size,
-                               index=index)
-
-        nested_cv = []
-
-        for train_inds, test_inds in cv:
-
-            fold_index = train_inds if index is not None\
-                else None
-
-            fold_size = len(train_inds) if index is None else None
-
-            fold_cv = make_expanding_cv(
-                    test_proportion=test_proportion,
-                    start_train_proportion=start_train_proportion,
-                    step_proportion=step_proportion,
-                    expanding_test_size=expanding_test_size,
-                    data_set_size=fold_size,
-                    index=fold_index)
-
-            nested_cv.append(list(fold_cv))
-
-        return nested_cv
-
-    except Exception as e:
-        logger.log_and_raise_error(("Failed to make nested expanding cv. "
-                                    "Exit with error: {}".format(e)))
-
-
-def cv_slice_dataset(X, y, train_inds, test_inds)\
-        -> Tuple[Union[pd.DataFrame, np.ndarray],
-                 Union[pd.Series, np.ndarray]]:
-    """
-    """
-    if isinstance(X, pd.DataFrame):
-        X_train = X.loc[train_inds]
-        X_val = X.loc[test_inds]
-    else:
-        X_train = X[train_inds]
-        X_val = X[test_inds]
-
-    if y is not None:
-        y_train = y[train_inds]
-        y_val = y[test_inds]
-
-    return X_train, X_val, y_train, y_val
 
+# TODO: write with yield !!!!
 
 def get_optimal_proba_threshold(score_func: Callable,
                                 y_true: Union[pd.Series, np.ndarray],
@@ -202,11 +145,12 @@ def cross_validate_with_optimal_threshold(
             X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
             y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
 
-            cv_threshold, X_train, y_train = make_dummy_cv(
-                    X_train=X_train,
+            cv_threshold, X_train, y_train =\
+                CVComposer().dummy_cv_and_concatenated_data_set(
+                    X_train=X_train, 
+                    X_test=X_val_threshold,
                     y_train=y_train,
-                    X_val=X_val_threshold,
-                    y_val=y_val_threshold)
+                    y_test=y_val_threshold)
         else:
 
             # if cv_threshold is given, we find the optimal threshold
@@ -230,10 +174,11 @@ def cross_validate_with_optimal_threshold(
             print("----- In cv threshold fold")
 
             X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
-                cv_slice_dataset(X=X_train,
-                                 y=y_train,
-                                 train_inds=train_inds,
-                                 test_inds=val_inds)
+                CVComposer().cv_slice_dataset(
+                    X=X_train,
+                    y=y_train,
+                    train_inds=train_inds,
+                    test_inds=val_inds)
 
             estimator.fit(X_train_fold, y_train_fold)
 
@@ -284,10 +229,11 @@ def cross_validate_with_optimal_threshold(
             print("=== In cv fold")
 
             X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
-                cv_slice_dataset(X=X_train,
-                                 y=y_train,
-                                 train_inds=train_inds,
-                                 test_inds=val_inds)
+                CVComposer().cv_slice_dataset(
+                    X=X_train,
+                    y=y_train,
+                    train_inds=train_inds,
+                    test_inds=val_inds)
 
             scores = cross_validate_with_optimal_threshold(
                     estimator=estimator,
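
Illustratively, the refactored threshold step is expected to be used roughly as below. This is a sketch only: it assumes dummy_cv_and_concatenated_data_set returns a single-split cv over the concatenated train/validation data (as the replaced make_dummy_cv did), and the toy data is not part of this commit.

    import numpy as np
    import pandas as pd

    from cdplib.ml_validation.CVComposer import CVComposer

    # toy train/validation split with non-overlapping index labels
    X_train = pd.DataFrame({"feature": range(40)})
    y_train = pd.Series(np.random.randint(0, 2, size=40))
    X_val = pd.DataFrame({"feature": range(40, 50)}, index=range(40, 50))
    y_val = pd.Series(np.random.randint(0, 2, size=10), index=range(40, 50))

    composer = CVComposer()

    # one dummy split over the concatenated data, used to tune the threshold
    cv_threshold, X_all, y_all = composer.dummy_cv_and_concatenated_data_set(
            X_train=X_train,
            X_test=X_val,
            y_train=y_train,
            y_test=y_val)

    for train_inds, val_inds in cv_threshold:
        X_tr, X_va, y_tr, y_va = composer.cv_slice_dataset(
                X=X_all, y=y_all, train_inds=train_inds, test_inds=val_inds)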

+ 0 - 97
cdplib/ml_validation/expanding_cv.py

@@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Dec  9 09:55:52 2020
-
-@author: tanya
-"""
-
-from typing import Union, Iterable, Tuple, List
-import pandas as pd
-import numpy as np
-from itertools import accumulate, repeat, takewhile
-
-from cdplib.log import Log
-
-
-def make_expanding_cv(test_proportion: float,
-                      start_train_proportion: float,
-                      step_proportion: float = None,
-                      expanding_test_size: bool = False,
-                      data_set_size: Union[float, None] = None,
-                      index: Union[pd.Series, np.ndarray, list, None] = None)\
-        -> Union[Iterable[Tuple[List]], None]:
-    """
-
-    """
-    logger = Log("make_expanding_cv:")
-
-    try:
-        assert((index is None) != (data_set_size is None)),\
-            "Set index or data_set_size"
-
-        index = index if (index is not None)\
-            else pd.Series(range(data_set_size))
-
-        data_set_size = data_set_size or len(index)
-
-        start_train_size = int(start_train_proportion * data_set_size)
-        step_size = int(step_proportion * data_set_size)
-
-        test_size = int(test_proportion * data_set_size)
-
-        train_inds_set = (list(range(train_size))
-                          for train_size in
-                          takewhile(
-                                  lambda x: x <= data_set_size - test_size,
-                                  accumulate(repeat(start_train_size),
-                                             lambda x, _: x + step_size)))
-
-        for train_inds in train_inds_set:
-
-            if expanding_test_size:
-
-                yield (index[train_inds],
-                       index[train_inds[-1] + 1:
-                             train_inds[-1] + 1
-                             + int(test_proportion*len(train_inds))])
-
-            else:
-
-                yield (index[train_inds],
-                       index[train_inds[-1] + 1:
-                             train_inds[-1] + 1 + test_size])
-
-    except Exception as e:
-        logger.log_and_raise_error(("Failed to make expanding cv. "
-                                    "Exit with error: {}".format(e)))
-
-
-if __name__ == "__main__":
-
-    logger = Log("Test_expanding_cv: ")
-
-    logger.info("Start Testing")
-
-    logger.info("Testing expanding cv: ")
-
-    cv = make_expanding_cv(data_set_size=50,
-                           test_proportion=0.1,
-                           start_train_proportion=0.6,
-                           step_proportion=0.1,
-                           expanding_test_size=True)
-
-    cv = list(cv)
-
-    logger.info("Testing expanding cv with datetime index")
-
-    cv = make_expanding_cv(
-            test_proportion=0.1,
-            start_train_proportion=0.6,
-            step_proportion=0.1,
-            index=pd.date_range(start=pd.to_datetime("2020-01-01"),
-                                periods=50))
-
-    cv = list(cv)
-
-    logger.info("Finish testing")
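
The deleted module's smoke test can still be run against the class-based API; a minimal sketch (assuming CVComposer().expanding_cv keeps the signature of the removed make_expanding_cv):

    import pandas as pd

    from cdplib.log import Log
    from cdplib.ml_validation.CVComposer import CVComposer

    logger = Log("Test_expanding_cv: ")
    logger.info("Start Testing")

    composer = CVComposer()

    # expanding cv over an integer index of 50 samples
    cv = list(composer.expanding_cv(data_set_size=50,
                                    test_proportion=0.1,
                                    start_train_proportion=0.6,
                                    step_proportion=0.1,
                                    expanding_test_size=True))

    # expanding cv over a datetime index
    cv = list(composer.expanding_cv(
            test_proportion=0.1,
            start_train_proportion=0.6,
            step_proportion=0.1,
            index=pd.date_range(start=pd.to_datetime("2020-01-01"),
                                periods=50)))

    logger.info("Finish testing")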