@@ -91,38 +91,37 @@ def get_optimal_proba_threshold(score_func: Callable,
 
 
 def cross_validate_with_optimal_threshold(
+        score_func_threshold: Callable,
         estimator: object,
-        score_func: Callable,
-        X_train: Union[pd.DataFrame, np.ndarray],
-        y_train: Union[pd.Series, np.ndarray, None] = None,
+        X: Union[pd.DataFrame, np.ndarray],
+        y: Union[pd.Series, np.ndarray, None] = None,
+        scoring: Union[Callable, Dict] = None,
+        cv: Union[Iterable, int, None] = None,
         X_val: Union[pd.DataFrame, np.ndarray, None] = None,
         y_val: Union[pd.Series, np.ndarray, None] = None,
         X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
         y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
-        cv: Union[Iterable, int, None] = None,
         cv_threshold: Union[Iterable, int, None] = None,
-        additional_metrics: Union[Dict[str, Callable], None] = None,
         threshold_set: Union[Iterable, None] = None,
-        scores: Dict = None)\
-        -> Dict:
+        scores: Dict = None)-> Dict:
     """
     """
     logger = Log("cross_validate_with_optimal_threshold:")
 
-    X_train = deepcopy(X_train)
-    y_train = deepcopy(y_train)
+    X_train = deepcopy(X)
+    y_train = deepcopy(y)
     X_val = deepcopy(X_val)
     y_val = deepcopy(y_val)
     X_val_threshold = deepcopy(X_val_threshold)
     y_val_threshold = deepcopy(y_val_threshold)
 
     scores = scores or {"test_threshold": [],
-                        "test_score": [],
-                        "train_score": []}
+                        "test_score_threshold": [],
+                        "train_score_threshold": []}
 
-    additional_metrics = additional_metrics or {}
+    scoring = scoring or {}
 
-    for metric_name, metric in additional_metrics.items():
+    for metric_name, metric in scoring.items():
         if "test_" + metric_name not in scores:
             scores["test_" + metric_name] = []
             scores["train_" + metric_name] = []
@@ -182,9 +181,10 @@ def cross_validate_with_optimal_threshold(
 
             proba_val = estimator.predict_proba(X_val_fold)[:, 1]
 
-            threshold = get_optimal_proba_threshold(score_func=score_func,
-                                                    y_true=y_val_fold,
-                                                    proba=proba_val)
+            threshold = get_optimal_proba_threshold(
+                score_func=score_func_threshold,
+                y_true=y_val_fold,
+                proba=proba_val)
 
             thresholds.append(threshold)
 
@@ -201,15 +201,15 @@ def cross_validate_with_optimal_threshold(
         pred_train = (proba_train >= threshold)
         pred_val = (proba_val >= threshold)
 
-        train_score = score_func(y_train, pred_train)
-        test_score = score_func(y_val, pred_val)
+        train_score = score_func_threshold(y_train, pred_train)
+        test_score = score_func_threshold(y_val, pred_val)
 
-        for metric_name, metric in additional_metrics.items():
+        for metric_name, metric in scoring.items():
            scores["train_" + metric_name].append(metric(y_train, pred_train))
            scores["test_" + metric_name].append(metric(y_val, pred_val))
 
-        scores["train_score"].append(train_score)
-        scores["test_score"].append(test_score)
+        scores["train_score_threshold"].append(train_score)
+        scores["test_score_threshold"].append(test_score)
 
         return scores
 
@@ -231,13 +231,13 @@ def cross_validate_with_optimal_threshold(
 
             scores = cross_validate_with_optimal_threshold(
                 estimator=estimator,
-                score_func=score_func,
-                X_train=X_train_fold,
-                y_train=y_train_fold,
+                score_func_threshold=score_func_threshold,
+                X=X_train_fold,
+                y=y_train_fold,
                 X_val=X_val_fold,
                 y_val=y_val_fold,
                 cv_threshold=cv_fold,
-                additional_metrics=additional_metrics,
+                scoring=scoring,
                 threshold_set=threshold_set,
                 scores=scores)
 
@@ -258,11 +258,12 @@ if __name__ == "__main__":
 
     X_train, X_val, y_train, y_val = train_test_split(X, y)
 
-    estimator = XGBRFClassifier(use_label_encoder=False)
+    estimator = XGBRFClassifier(use_label_encoder=False,
+                                eval_metric="logloss")
 
     score_func = accuracy_score
 
-    additional_metrics = {"precision": precision_score}
+    scoring = {"precision": precision_score}
 
     averaged_scores = []
     averaged_thresholds = []
@@ -270,21 +271,21 @@ if __name__ == "__main__":
|
|
print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
|
|
print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
|
|
|
|
|
|
scores = cross_validate_with_optimal_threshold(
|
|
scores = cross_validate_with_optimal_threshold(
|
|
|
|
+ score_func_threshold=accuracy_score,
|
|
estimator=estimator,
|
|
estimator=estimator,
|
|
- score_func=accuracy_score,
|
|
|
|
- X_train=X_train,
|
|
|
|
- y_train=y_train,
|
|
|
|
|
|
+ X=X_train,
|
|
|
|
+ y=y_train,
|
|
|
|
+ scoring=scoring,
|
|
|
|
+ cv=None,
|
|
X_val=X_val,
|
|
X_val=X_val,
|
|
y_val=y_val,
|
|
y_val=y_val,
|
|
X_val_threshold=None,
|
|
X_val_threshold=None,
|
|
y_val_threshold=None,
|
|
y_val_threshold=None,
|
|
- cv=None,
|
|
|
|
- cv_threshold=None,
|
|
|
|
- additional_metrics=additional_metrics)
|
|
|
|
|
|
+ cv_threshold=None)
|
|
|
|
|
|
print("\nScores:", scores)
|
|
print("\nScores:", scores)
|
|
|
|
|
|
- averaged_scores.append(np.mean(scores["test_score"]))
|
|
|
|
|
|
+ averaged_scores.append(np.mean(scores["test_score_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
|
|
|
|
print("\n ########################################################## \n")
|
|
print("\n ########################################################## \n")
|
|
@@ -295,21 +296,21 @@ if __name__ == "__main__":
|
|
print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
|
|
print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
|
|
|
|
|
|
scores = cross_validate_with_optimal_threshold(
|
|
scores = cross_validate_with_optimal_threshold(
|
|
|
|
+ score_func_threshold=accuracy_score,
|
|
estimator=estimator,
|
|
estimator=estimator,
|
|
- score_func=accuracy_score,
|
|
|
|
- X_train=X_train,
|
|
|
|
- y_train=y_train,
|
|
|
|
|
|
+ X=X_train,
|
|
|
|
+ y=y_train,
|
|
|
|
+ scoring=scoring,
|
|
|
|
+ cv=None,
|
|
X_val=X_val,
|
|
X_val=X_val,
|
|
y_val=y_val,
|
|
y_val=y_val,
|
|
X_val_threshold=X_val_threshold,
|
|
X_val_threshold=X_val_threshold,
|
|
y_val_threshold=y_val_threshold,
|
|
y_val_threshold=y_val_threshold,
|
|
- cv=None,
|
|
|
|
- cv_threshold=None,
|
|
|
|
- additional_metrics=additional_metrics)
|
|
|
|
|
|
+ cv_threshold=None)
|
|
|
|
|
|
print("\nScores:", scores)
|
|
print("\nScores:", scores)
|
|
|
|
|
|
- averaged_scores.append(np.mean(scores["test_score"]))
|
|
|
|
|
|
+ averaged_scores.append(np.mean(scores["test_score_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
|
|
|
|
print("\n ########################################################## \n")
|
|
print("\n ########################################################## \n")
|
|
@@ -317,21 +318,21 @@ if __name__ == "__main__":
|
|
print("\nTesting cv=None, cv_threshold=3 \n")
|
|
print("\nTesting cv=None, cv_threshold=3 \n")
|
|
|
|
|
|
scores = cross_validate_with_optimal_threshold(
|
|
scores = cross_validate_with_optimal_threshold(
|
|
|
|
+ score_func_threshold=accuracy_score,
|
|
estimator=estimator,
|
|
estimator=estimator,
|
|
- score_func=accuracy_score,
|
|
|
|
- X_train=X_train,
|
|
|
|
- y_train=y_train,
|
|
|
|
|
|
+ X=X_train,
|
|
|
|
+ y=y_train,
|
|
|
|
+ scoring=scoring,
|
|
|
|
+ cv=None,
|
|
X_val=X_val,
|
|
X_val=X_val,
|
|
y_val=y_val,
|
|
y_val=y_val,
|
|
X_val_threshold=X_val_threshold,
|
|
X_val_threshold=X_val_threshold,
|
|
y_val_threshold=y_val_threshold,
|
|
y_val_threshold=y_val_threshold,
|
|
- cv=None,
|
|
|
|
- cv_threshold=3,
|
|
|
|
- additional_metrics=additional_metrics)
|
|
|
|
|
|
+ cv_threshold=3)
|
|
|
|
|
|
print("\nScores:", scores)
|
|
print("\nScores:", scores)
|
|
|
|
|
|
- averaged_scores.append(np.mean(scores["test_score"]))
|
|
|
|
|
|
+ averaged_scores.append(np.mean(scores["test_score_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
|
|
|
|
print("\n ########################################################## \n")
|
|
print("\n ########################################################## \n")
|
|
@@ -339,17 +340,17 @@ if __name__ == "__main__":
|
|
print("\nTesting cv=3, cv_threshold=None \n")
|
|
print("\nTesting cv=3, cv_threshold=None \n")
|
|
|
|
|
|
scores = cross_validate_with_optimal_threshold(
|
|
scores = cross_validate_with_optimal_threshold(
|
|
|
|
+ score_func_threshold=accuracy_score,
|
|
estimator=estimator,
|
|
estimator=estimator,
|
|
- score_func=accuracy_score,
|
|
|
|
- X_train=X_train,
|
|
|
|
- y_train=y_train,
|
|
|
|
|
|
+ X=X_train,
|
|
|
|
+ y=y_train,
|
|
|
|
+ scoring=scoring,
|
|
|
|
+ cv=3,
|
|
X_val=None,
|
|
X_val=None,
|
|
y_val=None,
|
|
y_val=None,
|
|
X_val_threshold=None,
|
|
X_val_threshold=None,
|
|
y_val_threshold=None,
|
|
y_val_threshold=None,
|
|
- cv=3,
|
|
|
|
- cv_threshold=None,
|
|
|
|
- additional_metrics=additional_metrics)
|
|
|
|
|
|
+ cv_threshold=None)
|
|
|
|
|
|
print("\nScores:", scores)
|
|
print("\nScores:", scores)
|
|
|
|
|
|
@@ -358,21 +359,21 @@ if __name__ == "__main__":
|
|
print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
|
|
print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
|
|
|
|
|
|
scores = cross_validate_with_optimal_threshold(
|
|
scores = cross_validate_with_optimal_threshold(
|
|
|
|
+ score_func_threshold=accuracy_score,
|
|
estimator=estimator,
|
|
estimator=estimator,
|
|
- score_func=accuracy_score,
|
|
|
|
- X_train=X_train,
|
|
|
|
- y_train=y_train,
|
|
|
|
|
|
+ X=X_train,
|
|
|
|
+ y=y_train,
|
|
|
|
+ scoring=scoring,
|
|
|
|
+ cv=3,
|
|
X_val=X_val,
|
|
X_val=X_val,
|
|
y_val=y_val,
|
|
y_val=y_val,
|
|
X_val_threshold=X_val_threshold,
|
|
X_val_threshold=X_val_threshold,
|
|
y_val_threshold=y_val_threshold,
|
|
y_val_threshold=y_val_threshold,
|
|
- cv=3,
|
|
|
|
- cv_threshold=[3, 3, 3],
|
|
|
|
- additional_metrics=additional_metrics)
|
|
|
|
|
|
+ cv_threshold=[3, 3, 3])
|
|
|
|
|
|
print("\nScores:", scores)
|
|
print("\nScores:", scores)
|
|
|
|
|
|
- averaged_scores.append(np.mean(scores["test_score"]))
|
|
|
|
|
|
+ averaged_scores.append(np.mean(scores["test_score_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
averaged_thresholds.append(np.mean(scores["test_threshold"]))
|
|
|
|
|
|
print("\n ########################################################## \n")
|
|
print("\n ########################################################## \n")
|