|
@@ -48,84 +48,27 @@ Created on Thu Oct 29 13:58:23 2020
|
|
|
|
|
|
"""
|
|
|
|
|
|
+import sys
|
|
|
+
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
from itertools import zip_longest
|
|
|
-from typing import Union, Callable, Dict, Iterable, Tuple, List
|
|
|
+
|
|
|
+if sys.version_info >= (3, 8):
|
|
|
+ from typing import Callable, Dict, Iterable, Union
|
|
|
+else:
|
|
|
+ from typing_extensions import Callable, Dict, Iterable, Union
|
|
|
+
|
|
|
from copy import deepcopy
|
|
|
-from itertools import accumulate, repeat, takewhile, chain
|
|
|
|
|
|
from sklearn.model_selection import StratifiedKFold
|
|
|
|
|
|
from cdplib.log import Log
|
|
|
|
|
|
+from cdplib.ml_validation.CVComposer import CVComposer
|
|
|
|
|
|
-# TODO: write with yield !!!!
|
|
|
-
|
|
|
-def make_nested_expanding_cv(
|
|
|
- test_proportion: float,
|
|
|
- start_train_proportion: float,
|
|
|
- step_proportion: float = None,
|
|
|
- expanding_test_size: bool = False,
|
|
|
- data_set_size: Union[float, None] = None,
|
|
|
- index: Union[pd.Series, np.ndarray, list, None] = None)\
|
|
|
- -> Iterable[Tuple[List]]:
|
|
|
- """
|
|
|
- """
|
|
|
- logger = Log("make_nested_expanding_cv:")
|
|
|
-
|
|
|
- try:
|
|
|
- cv = make_expanding_cv(test_proportion=test_proportion,
|
|
|
- start_train_proportion=start_train_proportion,
|
|
|
- step_proportion=step_proportion,
|
|
|
- expanding_test_size=expanding_test_size,
|
|
|
- data_set_size=data_set_size,
|
|
|
- index=index)
|
|
|
-
|
|
|
- nested_cv = []
|
|
|
-
|
|
|
- for train_inds, test_inds in cv:
|
|
|
-
|
|
|
- fold_index = train_inds if index is not None\
|
|
|
- else None
|
|
|
-
|
|
|
- fold_size = len(train_inds) if index is None else None
|
|
|
-
|
|
|
- fold_cv = make_expanding_cv(
|
|
|
- test_proportion=test_proportion,
|
|
|
- start_train_proportion=start_train_proportion,
|
|
|
- step_proportion=step_proportion,
|
|
|
- expanding_test_size=expanding_test_size,
|
|
|
- data_set_size=fold_size,
|
|
|
- index=fold_index)
|
|
|
-
|
|
|
- nested_cv.append(list(fold_cv))
|
|
|
-
|
|
|
- return nested_cv
|
|
|
-
|
|
|
- except Exception as e:
|
|
|
- logger.log_and_raise_error(("Failed to make nested expanding cv. "
|
|
|
- "Exit with error: {}".format(e)))
|
|
|
-
|
|
|
-
|
|
|
-def cv_slice_dataset(X, y, train_inds, test_inds)\
|
|
|
- -> Tuple[Union[pd.DataFrame, np.ndarray],
|
|
|
- Union[pd.Series, np.ndarray]]:
|
|
|
- """
|
|
|
- """
|
|
|
- if isinstance(X, pd.DataFrame):
|
|
|
- X_train = X.loc[train_inds]
|
|
|
- X_val = X.loc[test_inds]
|
|
|
- else:
|
|
|
- X_train = X[train_inds]
|
|
|
- X_val = X[test_inds]
|
|
|
-
|
|
|
- if y is not None:
|
|
|
- y_train = y[train_inds]
|
|
|
- y_val = y[test_inds]
|
|
|
-
|
|
|
- return X_train, X_val, y_train, y_val
|
|
|
|
|
|
+# TODO: write with yield (originally noted on make_nested_expanding_cv).
|
|
|
|
|
|
def get_optimal_proba_threshold(score_func: Callable,
|
|
|
y_true: Union[pd.Series, np.ndarray],
|
|
@@ -202,11 +145,12 @@ def cross_validate_with_optimal_threshold(
|
|
|
X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
|
|
|
y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
|
|
|
|
|
|
- cv_threshold, X_train, y_train = make_dummy_cv(
|
|
|
- X_train=X_train,
|
|
|
+ cv_threshold, X_train, y_train =\
|
|
|
+ CVComposer().dummy_cv_and_concatenated_data_set(
|
|
|
+ X_train=X_train,
|
|
|
+ X_test=X_val_threshold,
|
|
|
y_train=y_train,
|
|
|
- X_val=X_val_threshold,
|
|
|
- y_val=y_val_threshold)
|
|
|
+ y_test=y_val_threshold)
|
|
|
else:
|
|
|
|
|
|
# if cv_threshold is given, we find the optimal threshold
|
|
@@ -230,10 +174,11 @@ def cross_validate_with_optimal_threshold(
|
|
|
                 print("----- In cv threshold fold")
|
|
|
|
|
|
                 X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
|
|
|
-                    cv_slice_dataset(X=X_train,
|
|
|
-                                     y=y_train,
|
|
|
-                                     train_inds=train_inds,
|
|
|
-                                     test_inds=val_inds)
|
|
|
+                    CVComposer().cv_slice_dataset(
|
|
|
+                            X=X_train,
|
|
|
+                            y=y_train,
|
|
|
+                            train_inds=train_inds,
|
|
|
+                            test_inds=val_inds)
|
|
|
|
|
|
                 estimator.fit(X_train_fold, y_train_fold)
|
|
|
|
|
@@ -284,10 +229,11 @@ def cross_validate_with_optimal_threshold(
|
|
|
print("=== In cv fold")
|
|
|
|
|
|
X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
|
|
|
- cv_slice_dataset(X=X_train,
|
|
|
- y=y_train,
|
|
|
- train_inds=train_inds,
|
|
|
- test_inds=val_inds)
|
|
|
+ CVComposer().cv_slice_dataset(
|
|
|
+ X=X_train,
|
|
|
+ y=y_train,
|
|
|
+ train_inds=train_inds,
|
|
|
+ test_inds=val_inds)
|
|
|
|
|
|
scores = cross_validate_with_optimal_threshold(
|
|
|
estimator=estimator,
|