Bläddra i källkod

added composed_space_sample

tanja 3 år sedan
förälder
incheckning
28651a6840

+ 12 - 6
cdplib/hyperopt/HyperoptPipelineSelector.py

@@ -270,15 +270,20 @@ class HyperoptPipelineSelector(PipelineSelector):
             self._logger.log_and_throw_warning("Trials object is empty")
             return {}
         else:
-            try:
-                assert(self.attached_space)
-            except AssertionError:
-                err = "Space is not attached"
-
             try:
                 best_trial = deepcopy(self._trials.best_trial)
 
-                space_element = self._get_space_element_from_trial(best_trial)
+                if self.attached_space:
+
+                    space_element = self._get_space_element_from_trial(
+                            best_trial)
+                else:
+                    space_element = {}
+
+                    warn = ("Space is not attached. "
+                            "To include the best pipeline, "
+                            "attach the space")
+                    self._logger.log_and_throw_warning(warn)
 
                 best_trial = deepcopy(self._trials.best_trial["result"])
 
@@ -402,6 +407,7 @@ if __name__ == '__main__':
     from cdplib.log import Log
     from cdplib.db_handlers import MongodbHandler
     from cdplib.hyperopt.space_sample import space
+    # from cdplib.hyperopt.composed_space_sample import space
 
     trials_path = "hyperopt_trials_TEST.pkl"
     additional_metrics = {"precision": precision_score}

+ 68 - 59
cdplib/hyperopt/composed_space_sample.py

@@ -13,10 +13,13 @@ from xgboost import XGBRFClassifier
 from sklearn.svm import SVC
 from sklearn.linear_model import LogisticRegression
 from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
 from hyperopt import hp
 
-from cdplib.hyperopt.SpaceComposer import SpaceComposer
+from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
 
+# TODO: add sample spaces for encoders and transformers
 
 encoders = []
 
@@ -26,46 +29,49 @@ selectors = [
     {"name": "kbest",
      "object": SelectPercentile(),
      "params": {
-       "percentile": 3 + hp.randint("kbest__percentile", 200),
+       "percentile": 3 + hp.randint("kbest__percentile", 60),
        "score_func": hp.choice("kbest__score_func",
                                [f_classif, chi2, mutual_info_classif])}},
 
-#    {"name": "fpr",
-#     "object": SelectFpr(),
-#     "params": {
-#        "score_func": hp.choice("fpr__score_func",
-#                                [f_classif, chi2, mutual_info_classif]),
-#        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
-
-#    {"name": "rfe_rf",
-#     "object":
-#         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
-#     "params": {
-#         "n_features_to_select":
-#             3 + hp.randint("rfe_rf__n_features_to_select", 200),
-#         "estimator__n_estimators":
-#             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
-
-#    {"name": "rfm_rf",
-#     "object":
-#         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
-#                                                          random_state=33)),
-#     "params": {
-#         "estimator__n_estimators":
-#             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
-
-#    {"name": "rfm_lr",
-#     "object":
-#         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
-#                                                      random_state=33)),
-#     "params": {
-#          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
-
-#    {"name": "pca",
-#     "object": PCA(random_state=33),
-#     "params": {
-#       "n_components": 3 + hp.randint("pca__n_components", 20)
-#       }}
+    {"name": "fpr",
+     "object": SelectFpr(),
+     "params": {
+        "score_func": hp.choice("fpr__score_func",
+                                [f_classif, chi2]),
+        # mutual_info_classif does not work here
+        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
+
+    {"name": "rfe_rf",
+     "object":
+         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
+     "params": {
+         "n_features_to_select":
+             3 + hp.randint("rfe_rf__n_features_to_select", 200),
+         "estimator__n_estimators":
+             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
+
+    {"name": "rfm_rf",
+     "object":
+         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
+                                                          random_state=33)),
+     "params": {
+         "estimator__n_estimators":
+             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
+
+    {"name": "rfm_lr",
+     "object":
+         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
+                                                      random_state=33)),
+     "params": {
+          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
+
+    {"name": "std_scaler_pca",
+     "object": Pipeline([
+             ("scaler", StandardScaler()),
+             ("pca", PCA(random_state=33))]),
+     "params": {
+        "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
+       }}
     ]
 
 models = [
@@ -77,27 +83,30 @@ models = [
            "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
            }},
 
-#        {"name": "rf",
-#         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
-#         "params": {
-#           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
-#           "max_depth": 3 + hp.randint("rf__max_depth", 10),
-#           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
-#           }},
-
-#        {"name": "lr",
-#         "object": LogisticRegression(n_jobs=-1, random_state=33),
-#         "params":  {
-#           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
-#           "C": hp.uniform("lr__C", 0.1, 1000)}},
-
-#        {"name": "svc",
-#         "object": SVC(random_state=33),
-#         "params": {
-#            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
-#            "degree": 2 + hp.randint("svc__degree", 3),
-#            "C": hp.uniform("svc__C", 0.1, 1000)
-#            }}
+        {"name": "rf",
+         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
+           "max_depth": 3 + hp.randint("rf__max_depth", 10),
+           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
+           }},
+
+        # the default solver does not accept l1 penalty
+        {"name": "lr",
+         "object": LogisticRegression(n_jobs=-1, random_state=33,
+                                      solver='liblinear'),
+         "params":  {
+           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
+           "C": hp.uniform("lr__C", 0.1, 1000)}},
+
+        # svc does not support parallelization and is therefore slow
+        {"name": "svc",
+         "object": SVC(random_state=33),
+         "params": {
+            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
+            "degree": 2 + hp.randint("svc__degree", 3),
+            "C": hp.uniform("svc__C", 0.1, 1000)
+            }}
         ]
 
 step_list = [encoders, transformers, selectors, models]

+ 0 - 107
cdplib/hyperopt/space_SAMPLE.py

@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Sep 30 13:58:39 2020
-
-@author: tanya
-@description: a sample space of scikit learn pipelines
- to pass to the HyperoptPipelineSelection class
-
-"""
-
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.feature_selection import SelectFromModel, SelectKBest,\
-    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
-from xgboost import XGBRFClassifier
-from sklearn.svm import SVC
-from sklearn.linear_model import LogisticRegression
-from sklearn.decomposition import PCA
-from hyperopt import hp
-
-from cdplib.hyperopt.SpaceComposer import space_composer
-
-encoders = []
-
-transformers = []
-
-selectors = [
-    {"name": "kbest",
-     "object": SelectKBest(),
-     "params": {
-       "k": 3 + hp.randint("kbest__k", 200),
-       "score_func": hp.choice("kbest__score_func",
-                               [f_classif, chi2, mutual_info_classif])}},
-
-    {"name": "fpr",
-     "object": SelectFpr(),
-     "params": {
-        "score_func": hp.choice("fpr__score_func",
-                                [f_classif, chi2, mutual_info_classif]),
-        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
-
-    {"name": "rfe_rf",
-     "object":
-         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
-     "params": {
-         "n_features_to_select":
-             3 + hp.randint("rfe_rf__n_features_to_select", 200),
-         "estimator__n_estimators":
-             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
-
-    {"name": "rfm_rf",
-     "object":
-         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
-                                                          random_state=33)),
-     "params": {
-         "estimator__n_estimators":
-             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
-
-    {"name": "rfm_lr",
-     "object":
-         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
-                                                      random_state=33)),
-     "params": {
-          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
-
-    {"name": "pca",
-     "object": PCA(random_state=33),
-     "params": {
-       "n_components": 3 + hp.randint("pca__n_components", 20)
-       }}
-    ]
-
-models = [
-        {"name": "xgb",
-         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
-         "params": {
-           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
-           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
-           "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
-           }},
-
-        {"name": "rf",
-         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
-         "params": {
-           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
-           "max_depth": 3 + hp.randint("rf__max_depth", 10),
-           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
-           }},
-
-        {"name": "lr",
-         "object": LogisticRegression(n_jobs=-1, random_state=33),
-         "params":  {
-           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
-           "C": hp.uniform("lr__C", 0.1, 1000)}},
-
-        {"name": "svc",
-         "object": SVC(random_state=33),
-         "params": {
-            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
-            "degree": 2 + hp.randint("svc__degree", 3),
-            "C": hp.uniform("svc__C", 0.1, 1000)
-            }}
-        ]
-
-step_list = [encoders, transformers, selectors, models]
-
-space = space_composer(step_list)

+ 0 - 105
cdplib/hyperopt/space_sample_advanced.py

@@ -1,105 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jul  6 14:02:24 2020
-
-@author: tanya
-@description: space object to pass to HyperoptPipelineSelection class
-"""
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
-    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
-from xgboost import XGBRFClassifier
-from sklearn.svm import SVC
-from sklearn.linear_model import LogisticRegression
-from sklearn.decomposition import PCA
-from hyperopt import hp
-
-from cdplib.hyperopt.SpaceComposer import space_composer
-
-
-encoders = []
-
-transformers = []
-
-selectors = [
-    {"name": "kbest",
-     "object": SelectPercentile(),
-     "params": {
-       "k": 3 + hp.randint("kbest__percentile", 200),
-       "score_func": hp.choice("kbest__score_func",
-                               [f_classif, chi2, mutual_info_classif])}},
-
-    {"name": "fpr",
-     "object": SelectFpr(),
-     "params": {
-        "score_func": hp.choice("fpr__score_func",
-                                [f_classif, chi2, mutual_info_classif]),
-        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
-
-    {"name": "rfe_rf",
-     "object":
-         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
-     "params": {
-         "n_features_to_select":
-             3 + hp.randint("rfe_rf__n_features_to_select", 200),
-         "estimator__n_estimators":
-             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
-
-    {"name": "rfm_rf",
-     "object":
-         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
-                                                          random_state=33)),
-     "params": {
-         "estimator__n_estimators":
-             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
-
-    {"name": "rfm_lr",
-     "object":
-         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
-                                                      random_state=33)),
-     "params": {
-          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
-
-    {"name": "pca",
-     "object": PCA(random_state=33),
-     "params": {
-       "n_components": 3 + hp.randint("pca__n_components", 20)
-       }}
-    ]
-
-models = [
-        {"name": "xgb",
-         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
-         "params": {
-           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
-           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
-           "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
-           }},
-
-        {"name": "rf",
-         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
-         "params": {
-           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
-           "max_depth": 3 + hp.randint("rf__max_depth", 10),
-           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
-           }},
-
-        {"name": "lr",
-         "object": LogisticRegression(n_jobs=-1, random_state=33),
-         "params":  {
-           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
-           "C": hp.uniform("lr__C", 0.1, 1000)}},
-
-        {"name": "svc",
-         "object": SVC(random_state=33),
-         "params": {
-            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
-            "degree": 2 + hp.randint("svc__degree", 3),
-            "C": hp.uniform("svc__C", 0.1, 1000)
-            }}
-        ]
-
-step_list = [encoders, transformers, selectors, models]
-
-space = space_composer(step_list)

+ 5 - 5
cdplib/pipeline_selector/PipelineSelector.py

@@ -134,6 +134,11 @@ class PipelineSelector(ABC):
 
         ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
 
+        self.attached_space = False
+        self.attached_data = False
+        self.configured_cross_validation = False
+        self.configured_summary_saving = False
+
         self._cost_func = cost_func
         # score factor is 1 when cost_func is minimized,
         # -1 when cost func is maximized
@@ -183,11 +188,6 @@ class PipelineSelector(ABC):
             self._trials = None
             self._start_iteration = 0
 
-        self.attached_space = False
-        self.attached_data = False
-        self.configured_cross_validation = False
-        self.configured_summary_saving = False
-
         # keeping track of the current search iteration
         self._iteration = self._start_iteration
         self._score_improved = False