Просмотр исходного кода

added a sample space to pass to the hyperopt pipeline selector

tanja лет назад: 3
Родитель
Сommit
232d4f35cf
2 измененных файлов с 154 добавлено и 0 удалено
  1. 47 0
      cdplib/hyperopt/SpaceComposer.py
  2. 107 0
      cdplib/hyperopt/space_SAMPLE.py

+ 47 - 0
cdplib/hyperopt/SpaceComposer.py

@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 13:54:04 2020
+
+@author: tanya
+@description: a function that from a given list of pipeline steps
+ composes a space to be passed in the HyperoptPipelineSelection class.
+ A classic list of steps would be: [encoders, transformers, selectors, models]
+"""
+from sklearn.pipeline import Pipeline
+from hyperopt import hp
+from itertools import product
+
+
+def space_composer(step_list: list) -> hp.choice:
+    """
+    Compose a hyperopt search space over all pipeline combinations.
+
+    One candidate is taken from every non-empty step category and
+    one sklearn Pipeline is built per combination (cartesian product).
+
+    :param step_list: list of pipeline steps
+     of the form [encoders, transformers, selectors, models]
+     each element of step_list is a list of dictionaries
+     of the form {"name": NAME, "object": OBJECT, "params": PARAMS};
+     empty categories are skipped, a missing or None "params"
+     is treated as "no parameters to tune"
+    :return: hp.choice object of pipelines to choose from
+     when passed to the HyperoptPipelineSelection class;
+     every option is a dictionary with the keys
+     "name", "pipeline" and "params"
+    """
+
+    pipelines = []
+
+    # empty categories are dropped: a single empty factor
+    # would make the whole cartesian product empty
+    step_combinations = product(*[step for step in
+                                  step_list if len(step) > 0])
+
+    for step_combination in step_combinations:
+
+        pipeline_dist = {}
+
+        pipeline_dist["name"] = "_".join([step["name"]
+                                          for step in step_combination])
+
+        # no trailing comma here: it would silently wrap
+        # the Pipeline into a 1-element tuple
+        pipeline_dist["pipeline"] = Pipeline([(step["name"], step["object"])
+                                              for step in step_combination])
+
+        # parameters are addressed as <step name>__<parameter name>,
+        # i.e. the way Pipeline.set_params expects them
+        pipeline_dist["params"] = {step["name"] + "__" + param_name: param_dist
+                                   for step in step_combination
+                                   for param_name, param_dist
+                                   in (step.get("params") or {}).items()}
+
+        pipelines.append(pipeline_dist)
+
+    return hp.choice("pipelines", pipelines)

+ 107 - 0
cdplib/hyperopt/space_SAMPLE.py

@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 13:58:39 2020
+
+@author: tanya
+@description: a sample space of scikit learn pipelines
+ to pass to the HyperoptPipelineSelection class
+
+"""
+
+import math
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectFromModel, SelectKBest,\
+    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
+from xgboost import XGBRFClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from hyperopt import hp
+
+from cdplib.hyperopt.SpaceComposer import space_composer
+
+# No encoding / transformation candidates in this sample space;
+# both categories are left empty.
+encoders = []
+
+transformers = []
+
+# Feature selection / dimensionality reduction candidates.
+# Every entry has the form {"name", "object", "params"}; the hyperopt
+# labels are prefixed with the entry name.
+# NOTE(review): the upper bounds for k / n_features_to_select /
+# n_components are hard-coded - confirm that they do not exceed
+# the number of features in the data.
+selectors = [
+    {"name": "kbest",
+     "object": SelectKBest(),
+     "params": {
+       # hp.randint(label, 200) draws from [0, 200), hence k in [3, 202]
+       "k": 3 + hp.randint("kbest__k", 200),
+       "score_func": hp.choice("kbest__score_func",
+                               [f_classif, chi2, mutual_info_classif])}},
+
+    {"name": "fpr",
+     "object": SelectFpr(),
+     "params": {
+        "score_func": hp.choice("fpr__score_func",
+                                [f_classif, chi2, mutual_info_classif]),
+        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
+
+    # recursive feature elimination driven by a random forest
+    {"name": "rfe_rf",
+     "object":
+         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
+     "params": {
+         "n_features_to_select":
+             3 + hp.randint("rfe_rf__n_features_to_select", 200),
+         # estimator__ addresses a parameter of the wrapped estimator
+         "estimator__n_estimators":
+             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
+
+    # selection from a fitted random forest
+    {"name": "rfm_rf",
+     "object":
+         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
+                                                          random_state=33)),
+     "params": {
+         "estimator__n_estimators":
+             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
+
+    # selection from a fitted logistic regression
+    {"name": "rfm_lr",
+     "object":
+         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
+                                                      random_state=33)),
+     "params": {
+          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
+
+    {"name": "pca",
+     "object": PCA(random_state=33),
+     "params": {
+       "n_components": 3 + hp.randint("pca__n_components", 20)
+       }}
+    ]
+
+# Classifier candidates (last pipeline step).
+# Every entry has the form {"name", "object", "params"}; the hyperopt
+# labels are prefixed with the entry name.
+models = [
+        {"name": "xgb",
+         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
+           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
+           # hp.loguniform returns exp(uniform(low, high)): the bounds
+           # have to be given in log space to sample from [0.01, 0.5]
+           "learning_rate": hp.loguniform("xgb__learning_rate",
+                                          math.log(0.01), math.log(0.5))
+           }},
+
+        {"name": "rf",
+         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
+           "max_depth": 3 + hp.randint("rf__max_depth", 10),
+           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
+           }},
+
+        # the default solver does not support the l1 penalty,
+        # saga handles both penalties sampled below
+        {"name": "lr",
+         "object": LogisticRegression(solver="saga", n_jobs=-1,
+                                      random_state=33),
+         "params":  {
+           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
+           "C": hp.uniform("lr__C", 0.1, 1000)}},
+
+        {"name": "svc",
+         "object": SVC(random_state=33),
+         "params": {
+            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
+            # degree is only used by the poly kernel
+            "degree": 2 + hp.randint("svc__degree", 3),
+            "C": hp.uniform("svc__C", 0.1, 1000)
+            }}
+        ]
+
+# Order of the categories defines the order of the pipeline steps.
+step_list = [encoders, transformers, selectors, models]
+
+# Search space to pass to the HyperoptPipelineSelection class:
+# one option per combination of the listed candidates.
+space = space_composer(step_list)