|
@@ -13,10 +13,13 @@ from xgboost import XGBRFClassifier
|
|
|
from sklearn.svm import SVC
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
from sklearn.decomposition import PCA
|
|
|
+from sklearn.pipeline import Pipeline
|
|
|
+from sklearn.preprocessing import StandardScaler
|
|
|
from hyperopt import hp
|
|
|
|
|
|
-from cdplib.hyperopt.SpaceComposer import SpaceComposer
|
|
|
+from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
|
|
|
|
|
|
+# TODO: add sample spaces for encoders and transformers
|
|
|
|
|
|
encoders = []
|
|
|
|
|
@@ -26,46 +29,49 @@ selectors = [
|
|
|
{"name": "kbest",
|
|
|
"object": SelectPercentile(),
|
|
|
"params": {
|
|
|
- "percentile": 3 + hp.randint("kbest__percentile", 200),
|
|
|
+ "percentile": 3 + hp.randint("kbest__percentile", 60),
|
|
|
"score_func": hp.choice("kbest__score_func",
|
|
|
[f_classif, chi2, mutual_info_classif])}},
|
|
|
|
|
|
-# {"name": "fpr",
|
|
|
-# "object": SelectFpr(),
|
|
|
-# "params": {
|
|
|
-# "score_func": hp.choice("fpr__score_func",
|
|
|
-# [f_classif, chi2, mutual_info_classif]),
|
|
|
-# "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
|
|
|
-
|
|
|
-# {"name": "rfe_rf",
|
|
|
-# "object":
|
|
|
-# RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
|
|
|
-# "params": {
|
|
|
-# "n_features_to_select":
|
|
|
-# 3 + hp.randint("rfe_rf__n_features_to_select", 200),
|
|
|
-# "estimator__n_estimators":
|
|
|
-# 20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
|
|
|
-
|
|
|
-# {"name": "rfm_rf",
|
|
|
-# "object":
|
|
|
-# SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
|
|
|
-# random_state=33)),
|
|
|
-# "params": {
|
|
|
-# "estimator__n_estimators":
|
|
|
-# 20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
|
|
|
-
|
|
|
-# {"name": "rfm_lr",
|
|
|
-# "object":
|
|
|
-# SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
|
|
|
-# random_state=33)),
|
|
|
-# "params": {
|
|
|
-# "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
|
|
|
-
|
|
|
-# {"name": "pca",
|
|
|
-# "object": PCA(random_state=33),
|
|
|
-# "params": {
|
|
|
-# "n_components": 3 + hp.randint("pca__n_components", 20)
|
|
|
-# }}
|
|
|
+ {"name": "fpr",
|
|
|
+ "object": SelectFpr(),
|
|
|
+ "params": {
|
|
|
+ "score_func": hp.choice("fpr__score_func",
|
|
|
+ [f_classif, chi2]),
|
|
|
+ # mutual_info_classif does not work here
|
|
|
+ "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
|
|
|
+
|
|
|
+ {"name": "rfe_rf",
|
|
|
+ "object":
|
|
|
+ RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
|
|
|
+ "params": {
|
|
|
+ "n_features_to_select":
|
|
|
+ 3 + hp.randint("rfe_rf__n_features_to_select", 200),
|
|
|
+ "estimator__n_estimators":
|
|
|
+ 20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
|
|
|
+
|
|
|
+ {"name": "rfm_rf",
|
|
|
+ "object":
|
|
|
+ SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
|
|
|
+ random_state=33)),
|
|
|
+ "params": {
|
|
|
+ "estimator__n_estimators":
|
|
|
+ 20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
|
|
|
+
|
|
|
+ {"name": "rfm_lr",
|
|
|
+ "object":
|
|
|
+ SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
|
|
|
+ random_state=33)),
|
|
|
+ "params": {
|
|
|
+ "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
|
|
|
+
|
|
|
+ {"name": "std_scaler_pca",
|
|
|
+ "object": Pipeline([
|
|
|
+ ("scaler", StandardScaler()),
|
|
|
+ ("pca", PCA(random_state=33))]),
|
|
|
+ "params": {
|
|
|
+ "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
|
|
|
+ }}
|
|
|
]
|
|
|
|
|
|
models = [
|
|
@@ -77,27 +83,30 @@ models = [
|
|
|
"learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
|
|
|
}},
|
|
|
|
|
|
-# {"name": "rf",
|
|
|
-# "object": RandomForestClassifier(n_jobs=-1, random_state=33),
|
|
|
-# "params": {
|
|
|
-# "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
|
|
|
-# "max_depth": 3 + hp.randint("rf__max_depth", 10),
|
|
|
-# "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
|
|
|
-# }},
|
|
|
-
|
|
|
-# {"name": "lr",
|
|
|
-# "object": LogisticRegression(n_jobs=-1, random_state=33),
|
|
|
-# "params": {
|
|
|
-# "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
|
|
|
-# "C": hp.uniform("lr__C", 0.1, 1000)}},
|
|
|
-
|
|
|
-# {"name": "svc",
|
|
|
-# "object": SVC(random_state=33),
|
|
|
-# "params": {
|
|
|
-# "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
|
|
|
-# "degree": 2 + hp.randint("svc__degree", 3),
|
|
|
-# "C": hp.uniform("svc__C", 0.1, 1000)
|
|
|
-# }}
|
|
|
+ {"name": "rf",
|
|
|
+ "object": RandomForestClassifier(n_jobs=-1, random_state=33),
|
|
|
+ "params": {
|
|
|
+ "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
|
|
|
+ "max_depth": 3 + hp.randint("rf__max_depth", 10),
|
|
|
+ "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
|
|
|
+ }},
|
|
|
+
|
|
|
+ # the default solver does not accept l1 penalty
|
|
|
+ {"name": "lr",
|
|
|
+ "object": LogisticRegression(n_jobs=-1, random_state=33,
|
|
|
+ solver='liblinear'),
|
|
|
+ "params": {
|
|
|
+ "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
|
|
|
+ "C": hp.uniform("lr__C", 0.1, 1000)}},
|
|
|
+
|
|
|
+ # SVC does not support parallelization and is therefore slow
|
|
|
+ {"name": "svc",
|
|
|
+ "object": SVC(random_state=33),
|
|
|
+ "params": {
|
|
|
+ "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
|
|
|
+ "degree": 2 + hp.randint("svc__degree", 3),
|
|
|
+ "C": hp.uniform("svc__C", 0.1, 1000)
|
|
|
+ }}
|
|
|
]
|
|
|
|
|
|
step_list = [encoders, transformers, selectors, models]
|