4 years ago · 28651a6840
--- a/cdplib/hyperopt/HyperoptPipelineSelector.py
+++ b/cdplib/hyperopt/HyperoptPipelineSelector.py
@@ -270,15 +270,20 @@ class HyperoptPipelineSelector(PipelineSelector):
 
				             self._logger.log_and_throw_warning("Trials object is empty")
			
 
				             return {}
			
 
				         else:
			
 
				-            try:
			
 
				-                assert(self.attached_space)
			
 
				-            except AssertionError:
			
 
				-                err = "Space is not attached"
			
 
				-
			
 
				             try:
			
 
				                 best_trial = deepcopy(self._trials.best_trial)
			
 
				 
			
 
				-                space_element = self._get_space_element_from_trial(best_trial)
			
 
				+                if self.attached_space:
			
 
				+
			
 
				+                    space_element = self._get_space_element_from_trial(
			
 
				+                            best_trial)
			
 
				+                else:
			
 
				+                    space_element = {}
			
 
				+
			
 
				+                    warn = ("Space is not attached, "
			
 
				+                            "To included the best pipeline "
			
 
				+                            "attach the space")
			
 
				+                    self._logger.log_and_throw_warning(warn)
			
 
				 
			
 
				                 best_trial = deepcopy(self._trials.best_trial["result"])
			
 
				 
			
@@ -402,6 +407,7 @@ if __name__ == '__main__':
 
				     from cdplib.log import Log
			
 
				     from cdplib.db_handlers import MongodbHandler
			
 
				     from cdplib.hyperopt.space_sample import space
			
 
				+    # from cdplib.hyperopt.composed_space_sample import space
			
 
				 
			
 
				     trials_path = "hyperopt_trials_TEST.pkl"
			
 
				     additional_metrics = {"precision": precision_score}
			
--- a/cdplib/hyperopt/composed_space_sample.py
+++ b/cdplib/hyperopt/composed_space_sample.py
@@ -13,10 +13,13 @@ from xgboost import XGBRFClassifier
 
				 from sklearn.svm import SVC
			
 
				 from sklearn.linear_model import LogisticRegression
			
 
				 from sklearn.decomposition import PCA
			
 
				+from sklearn.pipeline import Pipeline
			
 
				+from sklearn.preprocessing import StandardScaler
			
 
				 from hyperopt import hp
			
 
				 
			
 
				-from cdplib.hyperopt.SpaceComposer import SpaceComposer
			
 
				+from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
			
 
				 
			
 
				+# TODO: add sample spaces for encoders and transformers
			
 
				 
			
 
				 encoders = []
			
 
				 
			
@@ -26,46 +29,49 @@ selectors = [
 
				     {"name": "kbest",
			
 
				      "object": SelectPercentile(),
			
 
				      "params": {
			
 
				-       "percentile": 3 + hp.randint("kbest__percentile", 200),
			
 
				+       "percentile": 3 + hp.randint("kbest__percentile", 60),
			
 
				        "score_func": hp.choice("kbest__score_func",
			
 
				                                [f_classif, chi2, mutual_info_classif])}},
			
 
				 
			
 
				-#    {"name": "fpr",
			
 
				-#     "object": SelectFpr(),
			
 
				-#     "params": {
			
 
				-#        "score_func": hp.choice("fpr__score_func",
			
 
				-#                                [f_classif, chi2, mutual_info_classif]),
			
 
				-#        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
			
 
				-
			
 
				-#    {"name": "rfe_rf",
			
 
				-#     "object":
			
 
				-#         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
			
 
				-#     "params": {
			
 
				-#         "n_features_to_select":
			
 
				-#             3 + hp.randint("rfe_rf__n_features_to_select", 200),
			
 
				-#         "estimator__n_estimators":
			
 
				-#             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
			
 
				-
			
 
				-#    {"name": "rfm_rf",
			
 
				-#     "object":
			
 
				-#         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
			
 
				-#                                                          random_state=33)),
			
 
				-#     "params": {
			
 
				-#         "estimator__n_estimators":
			
 
				-#             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
			
 
				-
			
 
				-#    {"name": "rfm_lr",
			
 
				-#     "object":
			
 
				-#         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
			
 
				-#                                                      random_state=33)),
			
 
				-#     "params": {
			
 
				-#          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
			
 
				-
			
 
				-#    {"name": "pca",
			
 
				-#     "object": PCA(random_state=33),
			
 
				-#     "params": {
			
 
				-#       "n_components": 3 + hp.randint("pca__n_components", 20)
			
 
				-#       }}
			
 
				+    {"name": "fpr",
			
 
				+     "object": SelectFpr(),
			
 
				+     "params": {
			
 
				+        "score_func": hp.choice("fpr__score_func",
			
 
				+                                [f_classif, chi2]),
			
 
				+        # mutual_info_classif does not work here
			
 
				+        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
			
 
				+
			
 
				+    {"name": "rfe_rf",
			
 
				+     "object":
			
 
				+         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
			
 
				+     "params": {
			
 
				+         "n_features_to_select":
			
 
				+             3 + hp.randint("rfe_rf__n_features_to_select", 200),
			
 
				+         "estimator__n_estimators":
			
 
				+             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
			
 
				+
			
 
				+    {"name": "rfm_rf",
			
 
				+     "object":
			
 
				+         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
			
 
				+                                                          random_state=33)),
			
 
				+     "params": {
			
 
				+         "estimator__n_estimators":
			
 
				+             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
			
 
				+
			
 
				+    {"name": "rfm_lr",
			
 
				+     "object":
			
 
				+         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
			
 
				+                                                      random_state=33)),
			
 
				+     "params": {
			
 
				+          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
			
 
				+
			
 
				+    {"name": "std_scaler_pca",
			
 
				+     "object": Pipeline([
			
 
				+             ("scaler", StandardScaler()),
			
 
				+             ("pca", PCA(random_state=33))]),
			
 
				+     "params": {
			
 
				+        "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
			
 
				+       }}
			
 
				     ]
			
 
				 
			
 
				 models = [
			
@@ -77,27 +83,30 @@ models = [
 
				            "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
			
 
				            }},
			
 
				 
			
 
				-#        {"name": "rf",
			
 
				-#         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
			
 
				-#         "params": {
			
 
				-#           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
			
 
				-#           "max_depth": 3 + hp.randint("rf__max_depth", 10),
			
 
				-#           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
			
 
				-#           }},
			
 
				-
			
 
				-#        {"name": "lr",
			
 
				-#         "object": LogisticRegression(n_jobs=-1, random_state=33),
			
 
				-#         "params":  {
			
 
				-#           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
			
 
				-#           "C": hp.uniform("lr__C", 0.1, 1000)}},
			
 
				-
			
 
				-#        {"name": "svc",
			
 
				-#         "object": SVC(random_state=33),
			
 
				-#         "params": {
			
 
				-#            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
			
 
				-#            "degree": 2 + hp.randint("svc__degree", 3),
			
 
				-#            "C": hp.uniform("svc__C", 0.1, 1000)
			
 
				-#            }}
			
 
				+        {"name": "rf",
			
 
				+         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
			
 
				+         "params": {
			
 
				+           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
			
 
				+           "max_depth": 3 + hp.randint("rf__max_depth", 10),
			
 
				+           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
			
 
				+           }},
			
 
				+
			
 
				+        # the default solver does not accept l1 penalty
			
 
				+        {"name": "lr",
			
 
				+         "object": LogisticRegression(n_jobs=-1, random_state=33,
			
 
				+                                      solver='liblinear'),
			
 
				+         "params":  {
			
 
				+           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
			
 
				+           "C": hp.uniform("lr__C", 0.1, 1000)}},
			
 
				+
			
 
				+        # svc does not support parallelization, therefore is slow
			
 
				+        {"name": "svc",
			
 
				+         "object": SVC(random_state=33),
			
 
				+         "params": {
			
 
				+            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
			
 
				+            "degree": 2 + hp.randint("svc__degree", 3),
			
 
				+            "C": hp.uniform("svc__C", 0.1, 1000)
			
 
				+            }}
			
 
				         ]
			
 
				 
			
 
				 step_list = [encoders, transformers, selectors, models]
			
--- a/cdplib/hyperopt/space_SAMPLE.py
+++ b/cdplib/hyperopt/space_SAMPLE.py
@@ -1,107 +0,0 @@
 
				-#!/usr/bin/env python3
			
 
				-# -*- coding: utf-8 -*-
			
 
				-"""
			
 
				-Created on Wed Sep 30 13:58:39 2020
			
 
				-
			
 
				-@author: tanya
			
 
				-@description: a sample space of scikit learn pipelines
			
 
				- to pass to the HyperoptPipelineSelection class
			
 
				-
			
 
				-"""
			
 
				-
			
 
				-from sklearn.ensemble import RandomForestClassifier
			
 
				-from sklearn.feature_selection import SelectFromModel, SelectKBest,\
			
 
				-    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
			
 
				-from xgboost import XGBRFClassifier
			
 
				-from sklearn.svm import SVC
			
 
				-from sklearn.linear_model import LogisticRegression
			
 
				-from sklearn.decomposition import PCA
			
 
				-from hyperopt import hp
			
 
				-
			
 
				-from cdplib.hyperopt.SpaceComposer import space_composer
			
 
				-
			
 
				-encoders = []
			
 
				-
			
 
				-transformers = []
			
 
				-
			
 
				-selectors = [
			
 
				-    {"name": "kbest",
			
 
				-     "object": SelectKBest(),
			
 
				-     "params": {
			
 
				-       "k": 3 + hp.randint("kbest__k", 200),
			
 
				-       "score_func": hp.choice("kbest__score_func",
			
 
				-                               [f_classif, chi2, mutual_info_classif])}},
			
 
				-
			
 
				-    {"name": "fpr",
			
 
				-     "object": SelectFpr(),
			
 
				-     "params": {
			
 
				-        "score_func": hp.choice("fpr__score_func",
			
 
				-                                [f_classif, chi2, mutual_info_classif]),
			
 
				-        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
			
 
				-
			
 
				-    {"name": "rfe_rf",
			
 
				-     "object":
			
 
				-         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
			
 
				-     "params": {
			
 
				-         "n_features_to_select":
			
 
				-             3 + hp.randint("rfe_rf__n_features_to_select", 200),
			
 
				-         "estimator__n_estimators":
			
 
				-             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
			
 
				-
			
 
				-    {"name": "rfm_rf",
			
 
				-     "object":
			
 
				-         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
			
 
				-                                                          random_state=33)),
			
 
				-     "params": {
			
 
				-         "estimator__n_estimators":
			
 
				-             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
			
 
				-
			
 
				-    {"name": "rfm_lr",
			
 
				-     "object":
			
 
				-         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
			
 
				-                                                      random_state=33)),
			
 
				-     "params": {
			
 
				-          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
			
 
				-
			
 
				-    {"name": "pca",
			
 
				-     "object": PCA(random_state=33),
			
 
				-     "params": {
			
 
				-       "n_components": 3 + hp.randint("pca__n_components", 20)
			
 
				-       }}
			
 
				-    ]
			
 
				-
			
 
				-models = [
			
 
				-        {"name": "xgb",
			
 
				-         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
			
 
				-         "params": {
			
 
				-           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
			
 
				-           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
			
 
				-           "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
			
 
				-           }},
			
 
				-
			
 
				-        {"name": "rf",
			
 
				-         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
			
 
				-         "params": {
			
 
				-           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
			
 
				-           "max_depth": 3 + hp.randint("rf__max_depth", 10),
			
 
				-           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
			
 
				-           }},
			
 
				-
			
 
				-        {"name": "lr",
			
 
				-         "object": LogisticRegression(n_jobs=-1, random_state=33),
			
 
				-         "params":  {
			
 
				-           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
			
 
				-           "C": hp.uniform("lr__C", 0.1, 1000)}},
			
 
				-
			
 
				-        {"name": "svc",
			
 
				-         "object": SVC(random_state=33),
			
 
				-         "params": {
			
 
				-            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
			
 
				-            "degree": 2 + hp.randint("svc__degree", 3),
			
 
				-            "C": hp.uniform("svc__C", 0.1, 1000)
			
 
				-            }}
			
 
				-        ]
			
 
				-
			
 
				-step_list = [encoders, transformers, selectors, models]
			
 
				-
			
 
				-space = space_composer(step_list)
			
--- a/cdplib/hyperopt/space_sample_advanced.py
+++ b/cdplib/hyperopt/space_sample_advanced.py
@@ -1,105 +0,0 @@
 
				-#!/usr/bin/env python3
			
 
				-# -*- coding: utf-8 -*-
			
 
				-"""
			
 
				-Created on Mon Jul  6 14:02:24 2020
			
 
				-
			
 
				-@author: tanya
			
 
				-@description: space object to pass to HyperoptPipelineSelection class
			
 
				-"""
			
 
				-from sklearn.ensemble import RandomForestClassifier
			
 
				-from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
			
 
				-    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
			
 
				-from xgboost import XGBRFClassifier
			
 
				-from sklearn.svm import SVC
			
 
				-from sklearn.linear_model import LogisticRegression
			
 
				-from sklearn.decomposition import PCA
			
 
				-from hyperopt import hp
			
 
				-
			
 
				-from cdplib.hyperopt.SpaceComposer import space_composer
			
 
				-
			
 
				-
			
 
				-encoders = []
			
 
				-
			
 
				-transformers = []
			
 
				-
			
 
				-selectors = [
			
 
				-    {"name": "kbest",
			
 
				-     "object": SelectPercentile(),
			
 
				-     "params": {
			
 
				-       "k": 3 + hp.randint("kbest__percentile", 200),
			
 
				-       "score_func": hp.choice("kbest__score_func",
			
 
				-                               [f_classif, chi2, mutual_info_classif])}},
			
 
				-
			
 
				-    {"name": "fpr",
			
 
				-     "object": SelectFpr(),
			
 
				-     "params": {
			
 
				-        "score_func": hp.choice("fpr__score_func",
			
 
				-                                [f_classif, chi2, mutual_info_classif]),
			
 
				-        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
			
 
				-
			
 
				-    {"name": "rfe_rf",
			
 
				-     "object":
			
 
				-         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
			
 
				-     "params": {
			
 
				-         "n_features_to_select":
			
 
				-             3 + hp.randint("rfe_rf__n_features_to_select", 200),
			
 
				-         "estimator__n_estimators":
			
 
				-             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
			
 
				-
			
 
				-    {"name": "rfm_rf",
			
 
				-     "object":
			
 
				-         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
			
 
				-                                                          random_state=33)),
			
 
				-     "params": {
			
 
				-         "estimator__n_estimators":
			
 
				-             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
			
 
				-
			
 
				-    {"name": "rfm_lr",
			
 
				-     "object":
			
 
				-         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
			
 
				-                                                      random_state=33)),
			
 
				-     "params": {
			
 
				-          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
			
 
				-
			
 
				-    {"name": "pca",
			
 
				-     "object": PCA(random_state=33),
			
 
				-     "params": {
			
 
				-       "n_components": 3 + hp.randint("pca__n_components", 20)
			
 
				-       }}
			
 
				-    ]
			
 
				-
			
 
				-models = [
			
 
				-        {"name": "xgb",
			
 
				-         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
			
 
				-         "params": {
			
 
				-           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
			
 
				-           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
			
 
				-           "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
			
 
				-           }},
			
 
				-
			
 
				-        {"name": "rf",
			
 
				-         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
			
 
				-         "params": {
			
 
				-           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
			
 
				-           "max_depth": 3 + hp.randint("rf__max_depth", 10),
			
 
				-           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
			
 
				-           }},
			
 
				-
			
 
				-        {"name": "lr",
			
 
				-         "object": LogisticRegression(n_jobs=-1, random_state=33),
			
 
				-         "params":  {
			
 
				-           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
			
 
				-           "C": hp.uniform("lr__C", 0.1, 1000)}},
			
 
				-
			
 
				-        {"name": "svc",
			
 
				-         "object": SVC(random_state=33),
			
 
				-         "params": {
			
 
				-            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
			
 
				-            "degree": 2 + hp.randint("svc__degree", 3),
			
 
				-            "C": hp.uniform("svc__C", 0.1, 1000)
			
 
				-            }}
			
 
				-        ]
			
 
				-
			
 
				-step_list = [encoders, transformers, selectors, models]
			
 
				-
			
 
				-space = space_composer(step_list)
			
--- a/cdplib/pipeline_selector/PipelineSelector.py
+++ b/cdplib/pipeline_selector/PipelineSelector.py
@@ -134,6 +134,11 @@ class PipelineSelector(ABC):
 
				 
			
 
				         ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
			
 
				 
			
 
				+        self.attached_space = False
			
 
				+        self.attached_data = False
			
 
				+        self.configured_cross_validation = False
			
 
				+        self.configured_summary_saving = False
			
 
				+
			
 
				         self._cost_func = cost_func
			
 
				         # score factor is 1 when cost_func is minimized,
			
 
				         # -1 when cost func is maximized
			
@@ -183,11 +188,6 @@ class PipelineSelector(ABC):
 
				             self._trials = None
			
 
				             self._start_iteration = 0
			
 
				 
			
 
				-        self.attached_space = False
			
 
				-        self.attached_data = False
			
 
				-        self.configured_cross_validation = False
			
 
				-        self.configured_summary_saving = False
			
 
				-
			
 
				         # keeping track of the current search iteration
			
 
				         self._iteration = self._start_iteration
			
 
				         self._score_improved = False