
made the signature of cross_validate_with_fine_tuning the same as in sklearn

tanja 3 years ago · commit 96c53164fd
1 changed file with 62 additions and 61 deletions

+ 62 - 61
cdplib/ml_validation/cross_validate_with_fine_tuning.py
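
For context, the renamed keyword arguments (estimator, X, y, scoring, cv) now match sklearn.model_selection.cross_validate; only score_func_threshold and the threshold-validation extras stay specific to this wrapper. A minimal sketch of the two call shapes, using an illustrative dataset that is not part of this commit (cdplib and xgboost must be importable for it to run):

    # sklearn's cross_validate: scoring maps names to scorer objects
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import accuracy_score, make_scorer, precision_score
    from sklearn.model_selection import cross_validate
    from xgboost import XGBRFClassifier

    from cdplib.ml_validation.cross_validate_with_fine_tuning import \
        cross_validate_with_optimal_threshold

    X_train, y_train = load_breast_cancer(return_X_y=True)
    estimator = XGBRFClassifier(use_label_encoder=False, eval_metric="logloss")

    cv_results = cross_validate(
        estimator, X_train, y=y_train,
        scoring={"precision": make_scorer(precision_score)}, cv=3)

    # the renamed wrapper takes the same estimator/X/y/scoring/cv keywords,
    # but its scoring dict maps names to plain metric callables, and
    # score_func_threshold is the metric used to pick the proba threshold
    scores = cross_validate_with_optimal_threshold(
            score_func_threshold=accuracy_score,
            estimator=estimator,
            X=X_train,
            y=y_train,
            scoring={"precision": precision_score},
            cv=3)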

@@ -91,38 +91,37 @@ def get_optimal_proba_threshold(score_func: Callable,
 
 
 def cross_validate_with_optimal_threshold(
+        score_func_threshold: Callable,
         estimator: object,
-        score_func: Callable,
-        X_train: Union[pd.DataFrame, np.ndarray],
-        y_train: Union[pd.Series, np.ndarray, None] = None,
+        X: Union[pd.DataFrame, np.ndarray],
+        y: Union[pd.Series, np.ndarray, None] = None,
+        scoring: Union[Callable, Dict] = None,
+        cv: Union[Iterable, int, None] = None,
         X_val: Union[pd.DataFrame, np.ndarray, None] = None,
         y_val: Union[pd.Series, np.ndarray, None] = None,
         X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
         y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
-        cv: Union[Iterable, int, None] = None,
         cv_threshold: Union[Iterable, int, None] = None,
-        additional_metrics: Union[Dict[str, Callable], None] = None,
         threshold_set: Union[Iterable, None] = None,
-        scores: Dict = None)\
-            -> Dict:
+        scores: Dict = None) -> Dict:
     """
     """
     logger = Log("cross_validate_with_optimal_threshold:")
 
-    X_train = deepcopy(X_train)
-    y_train = deepcopy(y_train)
+    X_train = deepcopy(X)
+    y_train = deepcopy(y)
     X_val = deepcopy(X_val)
     y_val = deepcopy(y_val)
     X_val_threshold = deepcopy(X_val_threshold)
     y_val_threshold = deepcopy(y_val_threshold)
 
     scores = scores or {"test_threshold": [],
-                        "test_score": [],
-                        "train_score": []}
+                        "test_score_threshold": [],
+                        "train_score_threshold": []}
 
-    additional_metrics = additional_metrics or {}
+    scoring = scoring or {}
 
-    for metric_name, metric in additional_metrics.items():
+    for metric_name, metric in scoring.items():
         if "test_" + metric_name not in scores:
             scores["test_" + metric_name] = []
             scores["train_" + metric_name] = []
@@ -182,9 +181,10 @@ def cross_validate_with_optimal_threshold(
 
             proba_val = estimator.predict_proba(X_val_fold)[:, 1]
 
-            threshold = get_optimal_proba_threshold(score_func=score_func,
-                                                    y_true=y_val_fold,
-                                                    proba=proba_val)
+            threshold = get_optimal_proba_threshold(
+                score_func=score_func_threshold,
+                y_true=y_val_fold,
+                proba=proba_val)
 
             thresholds.append(threshold)
 
@@ -201,15 +201,15 @@ def cross_validate_with_optimal_threshold(
         pred_train = (proba_train >= threshold)
         pred_val = (proba_val >= threshold)
 
-        train_score = score_func(y_train, pred_train)
-        test_score = score_func(y_val, pred_val)
+        train_score = score_func_threshold(y_train, pred_train)
+        test_score = score_func_threshold(y_val, pred_val)
 
-        for metric_name, metric in additional_metrics.items():
+        for metric_name, metric in scoring.items():
             scores["train_" + metric_name].append(metric(y_train, pred_train))
             scores["test_" + metric_name].append(metric(y_val, pred_val))
 
-        scores["train_score"].append(train_score)
-        scores["test_score"].append(test_score)
+        scores["train_score_threshold"].append(train_score)
+        scores["test_score_threshold"].append(test_score)
 
         return scores
 
@@ -231,13 +231,13 @@ def cross_validate_with_optimal_threshold(
 
             scores = cross_validate_with_optimal_threshold(
                     estimator=estimator,
-                    score_func=score_func,
-                    X_train=X_train_fold,
-                    y_train=y_train_fold,
+                    score_func_threshold=score_func_threshold,
+                    X=X_train_fold,
+                    y=y_train_fold,
                     X_val=X_val_fold,
                     y_val=y_val_fold,
                     cv_threshold=cv_fold,
-                    additional_metrics=additional_metrics,
+                    scoring=scoring,
                     threshold_set=threshold_set,
                     scores=scores)
 
@@ -258,11 +258,12 @@ if __name__ == "__main__":
 
     X_train, X_val, y_train, y_val = train_test_split(X, y)
 
-    estimator = XGBRFClassifier(use_label_encoder=False)
+    estimator = XGBRFClassifier(use_label_encoder=False,
+                                eval_metric="logloss")
 
     score_func = accuracy_score
 
-    additional_metrics = {"precision": precision_score}
+    scoring = {"precision": precision_score}
 
     averaged_scores = []
     averaged_thresholds = []
@@ -270,21 +271,21 @@ if __name__ == "__main__":
     print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
 
     scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
             estimator=estimator,
-            score_func=accuracy_score,
-            X_train=X_train,
-            y_train=y_train,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=None,
             X_val=X_val,
             y_val=y_val,
             X_val_threshold=None,
             y_val_threshold=None,
-            cv=None,
-            cv_threshold=None,
-            additional_metrics=additional_metrics)
+            cv_threshold=None)
 
     print("\nScores:", scores)
 
-    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
     averaged_thresholds.append(np.mean(scores["test_threshold"]))
 
     print("\n ########################################################## \n")
@@ -295,21 +296,21 @@ if __name__ == "__main__":
     print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
 
     scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
             estimator=estimator,
-            score_func=accuracy_score,
-            X_train=X_train,
-            y_train=y_train,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=None,
             X_val=X_val,
             y_val=y_val,
             X_val_threshold=X_val_threshold,
             y_val_threshold=y_val_threshold,
-            cv=None,
-            cv_threshold=None,
-            additional_metrics=additional_metrics)
+            cv_threshold=None)
 
     print("\nScores:", scores)
 
-    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
     averaged_thresholds.append(np.mean(scores["test_threshold"]))
 
     print("\n ########################################################## \n")
@@ -317,21 +318,21 @@ if __name__ == "__main__":
     print("\nTesting cv=None, cv_threshold=3 \n")
 
     scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
             estimator=estimator,
-            score_func=accuracy_score,
-            X_train=X_train,
-            y_train=y_train,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=None,
             X_val=X_val,
             y_val=y_val,
             X_val_threshold=X_val_threshold,
             y_val_threshold=y_val_threshold,
-            cv=None,
-            cv_threshold=3,
-            additional_metrics=additional_metrics)
+            cv_threshold=3)
 
     print("\nScores:", scores)
 
-    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
     averaged_thresholds.append(np.mean(scores["test_threshold"]))
 
     print("\n ########################################################## \n")
@@ -339,17 +340,17 @@ if __name__ == "__main__":
     print("\nTesting cv=3, cv_threshold=None \n")
 
     scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
             estimator=estimator,
-            score_func=accuracy_score,
-            X_train=X_train,
-            y_train=y_train,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=3,
             X_val=None,
             y_val=None,
             X_val_threshold=None,
             y_val_threshold=None,
-            cv=3,
-            cv_threshold=None,
-            additional_metrics=additional_metrics)
+            cv_threshold=None)
 
     print("\nScores:", scores)
 
@@ -358,21 +359,21 @@ if __name__ == "__main__":
     print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
 
     scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
             estimator=estimator,
-            score_func=accuracy_score,
-            X_train=X_train,
-            y_train=y_train,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=3,
             X_val=X_val,
             y_val=y_val,
             X_val_threshold=X_val_threshold,
             y_val_threshold=y_val_threshold,
-            cv=3,
-            cv_threshold=[3, 3, 3],
-            additional_metrics=additional_metrics)
+            cv_threshold=[3, 3, 3])
 
     print("\nScores:", scores)
 
-    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
     averaged_thresholds.append(np.mean(scores["test_threshold"]))
 
     print("\n ########################################################## \n")