4 years ago · 470408f2db
--- a/cdplib/gridsearch/GridSearchPipelineSelector.py
+++ b/cdplib/gridsearch/GridSearchPipelineSelector.py
@@ -0,0 +1,348 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+Created on Wed Sep 30 14:15:17 2020
			
 
				+
			
 
				+@author: tanya
			
 
				+@description:a class for selecting a machine learning
			
 
				+ pipeline from a deterministic space of parameter distributions
			
 
				+ over multiple pipelines.
			
 
				+ The selection is though in such a way that a Trials object is being
			
 
				+ maintained during the tuning process from which one can retrieve
			
 
				+ the best pipeline so far as well as the entire tuning history
			
 
				+ if needed.
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import sys
			
 
				+from itertools import product
			
 
				+from collections import ChainMap
			
 
				+from sklearn.pipeline import Pipeline
			
 
				+
			
 
				+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
			
 
				+
			
 
				+sys.path.append(os.getcwd())
			
 
				+
			
 
				+
			
 
				+class GridSearchPipelineSelector(PipelineSelector):
			
 
				+    """
			
 
				+    A class for selecting a machine learning
			
 
				+     pipeline from a deterministic space of parameter distributions
			
 
				+     over multiple pipelines.
			
 
				+     The selection is though in such a way that a Trials object is being
			
 
				+     maintained during the tuning process from which one can retrieve
			
 
				+     the best pipeline so far as well as the entire tuning history
			
 
				+     if needed.
			
 
				+    """
			
 
				+    def __init__(self,
			
 
				+                 cost_func,
			
 
				+                 greater_is_better: bool,
			
 
				+                 trials_path: str,
			
 
				+                 backup_trials_freq: int = 1,
			
 
				+                 cross_val_averaging_func: callable = None,
			
 
				+                 additional_metrics: dict = None,
			
 
				+                 strategy_name: str = None,
			
 
				+                 stdout_log_level: str = "INFO"
			
 
				+                 ):
			
 
				+        """
			
 
				+        :param callable cost_func: function to minimize or maximize
			
 
				+
			
 
				+        :param bool greater_is_better: when True
			
 
				+            cost_func is maximized, else minimized.
			
 
				+
			
 
				+        :param str trials_path: path at which the trials object is saved
			
 
				+            in binary format. From the trials object we can
			
 
				+            select information about the obtained scores, score variations,
			
 
				+            and pipelines, and parameters tried out so far. If a trials object
			
 
				+            already exists at the given path, it is loaded and the
			
 
				+            search is continued, else, the search is started from
			
 
				+            the beginning.
			
 
				+
			
 
				+        :param backup_trials_freq: frequecy in interations (trials)
			
 
				+            of saving the trials object at the trials_path.
			
 
				+
			
 
				+        :param str log_path: Optional, when not provided logs to stdout.
			
 
				+
			
 
				+        :param callable averaging_func: optional,
			
 
				+            when not provided set to mean. Function
			
 
				+            to aggregate the cross-validated values of the cost function.
			
 
				+            Classic situation is to take the mean,
			
 
				+            another example is, for example mean() - c*var().
			
 
				+        :param additional_metics: dict of additional metrics to save
			
 
				+            of the form {"metric_name": metric} where metric is a Callable.
			
 
				+
			
 
				+        :param str strategy_name: a name might be asigned to the trials,
			
 
				+            a strategy is defined by the data set, cv object, cost function.
			
 
				+            When the strategy changes, one should start with new trials.
			
 
				+
			
 
				+        :param str stdout_log_level: can be INFO, WARNING, ERROR
			
 
				+        """
			
 
				+        super().__init__(cost_func=cost_func,
			
 
				+                         greater_is_better=greater_is_better,
			
 
				+                         trials_path=trials_path,
			
 
				+                         backup_trials_freq=backup_trials_freq,
			
 
				+                         cross_val_averaging_func=cross_val_averaging_func,
			
 
				+                         additional_metrics=additional_metrics,
			
 
				+                         strategy_name=strategy_name,
			
 
				+                         stdout_log_level=stdout_log_level)
			
 
				+
			
 
				+        self._trials = self._trials or []
			
 
				+
			
 
				+    def run_trials(self):
			
 
				+        """
			
 
				+        """
			
 
				+        try:
			
 
				+            assert(self.attached_space)
			
 
				+        except AssertionError:
			
 
				+            err = "Parameter distribution space must be attached"
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+        done_trial_ids = [{"name": trial["name"],
			
 
				+                           "params": trial["params"],
			
 
				+                           "status": trial["status"]}
			
 
				+                          for trial in self._trials]
			
 
				+
			
 
				+        # list (generator) of (flattened) dictionaries
			
 
				+        # with all different combinations of
			
 
				+        # parameters for different pipelines
			
 
				+        # from the space definition.
			
 
				+        space_unfolded = ({"name": pipeline_dist["name"],
			
 
				+                           "pipeline": pipeline_dist["pipeline"],
			
 
				+                           "params": param_set}
			
 
				+                          for pipeline_dist in self._space
			
 
				+                          for param_set in
			
 
				+                          (dict(ChainMap(*tup)) for tup in
			
 
				+                           product(*[[{k: v} for v in
			
 
				+                                      pipeline_dist["params"][k]]
			
 
				+                                     for k in pipeline_dist["params"]])))
			
 
				+
			
 
				+        for space_element in space_unfolded:
			
 
				+
			
 
				+            trial_id = {"name": space_element["name"],
			
 
				+                        "params": space_element["params"],
			
 
				+                        "status": 'ok'}
			
 
				+
			
 
				+            if trial_id in done_trial_ids:
			
 
				+                continue
			
 
				+
			
 
				+            result = self._objective(space_element)
			
 
				+
			
 
				+            pipeline = space_element["pipeline"].set_params(
			
 
				+                    **space_element["params"])
			
 
				+
			
 
				+            trial = {"name": space_element["name"],
			
 
				+                     "params": space_element["params"],
			
 
				+                     "pipeline": pipeline}
			
 
				+
			
 
				+            trial.update(result)
			
 
				+
			
 
				+            self._trials.append(trial)
			
 
				+
			
 
				+        self._backup_trials()
			
 
				+
			
 
				+        self.finished_tuning = True
			
 
				+
			
 
				+    @property
			
 
				+    def number_of_trials(self) -> int:
			
 
				+        """
			
 
				+        Number of trials already run in the current trials object
			
 
				+        """
			
 
				+        if self._trials is None:
			
 
				+            return 0
			
 
				+        else:
			
 
				+            return len(self._trials)
			
 
				+
			
 
				+    @property
			
 
				+    def best_trial(self) -> dict:
			
 
				+        """
			
 
				+        """
			
 
				+        try:
			
 
				+            assert(self._trials is not None)
			
 
				+        except AssertionError:
			
 
				+            err = ("Trials object is empty. "
			
 
				+                   "Call run_trials method.")
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+        try:
			
 
				+            return max(self._trials, key=lambda x: x["score"])
			
 
				+        except Exception as e:
			
 
				+            err = ("Could not retrieve the best trial. "
			
 
				+                   "Exit with error: {}".format(e))
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+    @property
			
 
				+    def best_trial_score(self) -> float:
			
 
				+        '''
			
 
				+        '''
			
 
				+        try:
			
 
				+            assert(self._trials is not None)
			
 
				+        except AssertionError:
			
 
				+            err = ("Trials object is empty. "
			
 
				+                   "Call run_trials method.")
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+        try:
			
 
				+            return self.best_trial["score"]
			
 
				+        except Exception as e:
			
 
				+            err = ("Could not retrieve the best trial. "
			
 
				+                   "Exit with error: {}".format(e))
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+    @property
			
 
				+    def best_trial_score_variance(self) -> float:
			
 
				+        '''
			
 
				+        '''
			
 
				+        try:
			
 
				+            assert(self._trials is not None)
			
 
				+        except AssertionError:
			
 
				+            err = ("Trials object is empty. "
			
 
				+                   "Call run_trials method.")
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+        try:
			
 
				+            return self.best_trial["score_variance"]
			
 
				+        except Exception as e:
			
 
				+            err = ("Could not retrieve the best trial. "
			
 
				+                   "Exit with error: {}".format(e))
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+    @property
			
 
				+    def best_trial_pipeline(self) -> Pipeline:
			
 
				+        '''
			
 
				+        '''
			
 
				+        try:
			
 
				+            assert(self._trials is not None)
			
 
				+        except AssertionError:
			
 
				+            err = ("Trials object is empty. "
			
 
				+                   "Call run_trials method.")
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+        try:
			
 
				+            return self.best_trial["pipeline"]
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            err = ("Could not retrieve the best trial. "
			
 
				+                   "Exit with error: {}".format(e))
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+    def get_n_best_trial_pipelines(self, n: int) -> list:
			
 
				+        """
			
 
				+        N best pipelines with corresponding
			
 
				+        best hyperparameters
			
 
				+        """
			
 
				+        try:
			
 
				+            assert(isinstance(n, int))
			
 
				+        except AssertionError:
			
 
				+            err = "Parameter n must be an int"
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+        try:
			
 
				+            assert(self._trials is not None)
			
 
				+        except AssertionError:
			
 
				+            err = ("Trials object is empty. "
			
 
				+                   "Call run_trials method.")
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+        try:
			
 
				+            return [trial["pipeline"] for trial in
			
 
				+                    sorted(self._trials, key=lambda x: x["score"],
			
 
				+                           reverse=True)[:n]]
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            err = ("Failed to retrieve n best trials. "
			
 
				+                   "Exit with error: {}".format(e))
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
			
 
				+        """
			
 
				+        If the hyperparameter search is done over multiple
			
 
				+        pipelines, then returns n different pipeline-types
			
 
				+        with corresponding hyperparameters
			
 
				+        """
			
 
				+        try:
			
 
				+            assert(self._trials is not None)
			
 
				+        except AssertionError:
			
 
				+            err = ("Trials object is empty. "
			
 
				+                   "Call run_trials method.")
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+        try:
			
 
				+            return pd.DataFrame(self._trials)\
			
 
				+                     .sort_values(by=["name", "score"],
			
 
				+                                  ascending=False)\
			
 
				+                     .groupby("name")\
			
 
				+                     .head(n)[["pipeline"]]
			
 
				+
			
 
				+        except Exception as e:
			
 
				+            err = ("Failed to retrieve n best trials of each type."
			
 
				+                   "Exit with error: {}".format(e))
			
 
				+            self._logger.log_and_raise_error(err)
			
 
				+
			
 
				+    def trials_to_excel(self, path: str):
			
 
				+        """
			
 
				+        Trials object in the shape of table written to excel,
			
 
				+        should contain the run number, pipeline (as str),
			
 
				+        hyperparamters (as str), self.best_result (see self._objective method)
			
 
				+        as well as additional information configured
			
 
				+        through self.save_result method.
			
 
				+        """
			
 
				+        pd.DataFrame(self._trials).to_excel(path)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+
			
 
				+    # elementary example
			
 
				+
			
 
				+    from sklearn.datasets import load_breast_cancer
			
 
				+    from sklearn.metrics import accuracy_score, precision_score
			
 
				+    from cdplib.gridsearch.space_sample import space
			
 
				+    from cdplib.log import Log
			
 
				+    from cdplib.db_handlers import MongodbHandler
			
 
				+    import pickle
			
 
				+    import pandas as pd
			
 
				+    import os
			
 
				+
			
 
				+    trials_path = "gridsearch_trials_TEST.pkl"
			
 
				+    additional_metrics = {"precision": precision_score}
			
 
				+    strategy_name = "strategy_1"
			
 
				+    data_path = "data_TEST.h5"
			
 
				+    cv_path = "cv_TEST.pkl"
			
 
				+    collection_name = 'TEST_' + strategy_name
			
 
				+
			
 
				+    logger = Log("GridSearchPipelineSelector__TEST:")
			
 
				+
			
 
				+    logger.info("Start test")
			
 
				+
			
 
				+    data_loader = load_breast_cancer()
			
 
				+
			
 
				+    X = data_loader["data"]
			
 
				+    y = data_loader["target"]
			
 
				+
			
 
				+    pd.DataFrame(X).to_hdf(data_path, key="X_train")
			
 
				+    pd.Series(y).to_hdf(data_path, key="y_train")
			
 
				+
			
 
				+    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
			
 
				+          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]
			
 
				+
			
 
				+    pickle.dump(cv, open(cv_path, "wb"))
			
 
				+
			
 
				+    gs = GridSearchPipelineSelector(cost_func=accuracy_score,
			
 
				+                                    greater_is_better=True,
			
 
				+                                    trials_path=trials_path,
			
 
				+                                    additional_metrics=additional_metrics,
			
 
				+                                    strategy_name=strategy_name,
			
 
				+                                    stdout_log_level="WARNING")
			
 
				+
			
 
				+    gs.attach_space(space=space)
			
 
				+
			
 
				+    gs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
			
 
				+                             cv_pickle_path=cv_path)
			
 
				+
			
 
				+    save_method = MongodbHandler().insert_data_into_collection
			
 
				+    save_kwargs = {'collection_name': collection_name}
			
 
				+
			
 
				+    gs.configer_summary_saving(save_method=save_method,
			
 
				+                               kwargs=save_kwargs)
			
 
				+
			
 
				+    gs.run_trials()
			
 
				+
			
 
				+    logger.info("Best trial: {}".format(gs.best_trial))
			
 
				+
			
 
				+    for file in [trials_path, data_path, cv_path]:
			
 
				+        os.remove(file)
			
 
				+
			
 
				+    logger.info("End test")
			
--- a/cdplib/gridsearch/gridsearch.py
+++ b/cdplib/gridsearch/gridsearch.py
@@ -1,195 +0,0 @@
 
				-#!/usr/bin/env python3
			
 
				-# -*- coding: utf-8 -*-
			
 
				-"""
			
 
				-Created on Wed Sep 30 14:15:17 2020
			
 
				-
			
 
				-@author: tanya
			
 
				-"""
			
 
				-
			
 
				-import os
			
 
				-import sys
			
 
				-from itertools import product
			
 
				-from collections import ChainMap
			
 
				-from sklearn.pipeline import Pipeline
			
 
				-from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
			
 
				-
			
 
				-sys.path.append(os.getcwd())
			
 
				-
			
 
				-
			
 
				-class GridSearchPipelineSelector(PipelineSelector):
			
 
				-    """
			
 
				-    """
			
 
				-    def __init__(self,
			
 
				-                 cost_func,
			
 
				-                 greater_is_better: bool,
			
 
				-                 trials_path: str,
			
 
				-                 backup_trials_freq: int = 1,
			
 
				-                 cross_val_averaging_func: callable = None):
			
 
				-        '''
			
 
				-        :param callable cost_func: function to minimize or maximize
			
 
				-
			
 
				-        :param bool greater_is_better: when True
			
 
				-            cost_func is maximized, else minimized.
			
 
				-
			
 
				-        :param str trials_path: path at which the trials object is saved
			
 
				-            in binary format. From the trials object we can
			
 
				-            select information about the obtained scores, score variations,
			
 
				-            and pipelines, and parameters tried out so far. If a trials object
			
 
				-            already exists at the given path, it is loaded and the
			
 
				-            search is continued, else, the search is started from
			
 
				-            the beginning.
			
 
				-
			
 
				-        :param backup_trials_freq: frequecy in interations (trials)
			
 
				-            of saving the trials object at the trials_path.
			
 
				-
			
 
				-        :param str log_path: Optional, when not provided logs to stdout.
			
 
				-
			
 
				-        :param callable averaging_func: optional,
			
 
				-            when not provided set to mean. Function
			
 
				-            to aggregate the cross-validated values of the cost function.
			
 
				-            Classic situation is to take the mean,
			
 
				-            another example is, for example mean() - c*var().
			
 
				-        '''
			
 
				-        super().__init__(cost_func=cost_func,
			
 
				-                         greater_is_better=greater_is_better,
			
 
				-                         trials_path=trials_path,
			
 
				-                         backup_trials_freq=backup_trials_freq,
			
 
				-                         cross_val_averaging_func=cross_val_averaging_func)
			
 
				-
			
 
				-    def run_trials(self):
			
 
				-        """
			
 
				-        """
			
 
				-        if not self._space_attached:
			
 
				-            err = "Parameter distribution space must be attached"
			
 
				-            self._logger.log_and_raise_error(err)
			
 
				-
			
 
				-        self._trials = self._trials or []
			
 
				-
			
 
				-        done_trial_ids = [{"name": trial["name"],
			
 
				-                           "params": trial["params"]}
			
 
				-                          for trial in self._trials]
			
 
				-
			
 
				-        # list (generator) of (flattened) dictionaries
			
 
				-        # with all different combinations of
			
 
				-        # parameters for different pipelines
			
 
				-        # from the space definition.
			
 
				-        space_unfolded = ({"name": pipeline_dist["name"],
			
 
				-                           "pipeline": pipeline_dist["pipeline"],
			
 
				-                           "params": param_set}
			
 
				-                          for pipeline_dist in self._space
			
 
				-                          for param_set in
			
 
				-                          (dict(ChainMap(*tup)) for tup in
			
 
				-                           product(*[[{k: v} for v in
			
 
				-                                      pipeline_dist["params"][k]]
			
 
				-                                     for k in pipeline_dist["params"]])))
			
 
				-
			
 
				-        for space_element in space_unfolded:
			
 
				-
			
 
				-            trial_id = {"name": space_element["name"],
			
 
				-                        "params": space_element["params"]}
			
 
				-
			
 
				-            if trial_id in done_trial_ids:
			
 
				-                continue
			
 
				-
			
 
				-            result = self._objective(space_element)
			
 
				-
			
 
				-            pipeline = space_element["pipeline"].set_params(
			
 
				-                    **space_element["params"])
			
 
				-
			
 
				-            self._trials.append({"name": space_element["name"],
			
 
				-                                 "params": space_element["params"],
			
 
				-                                 "pipeline": pipeline,
			
 
				-                                 "result": result})
			
 
				-
			
 
				-    @property
			
 
				-    def best_trial(self) -> dict:
			
 
				-        """
			
 
				-        """
			
 
				-        if self._trials is None:
			
 
				-            self._logger.error(("Trials object is empty. "
			
 
				-                                "Call run_trials method."))
			
 
				-        else:
			
 
				-            try:
			
 
				-                return max(gs._trials, key=lambda x: x["result"]["score"])
			
 
				-            except Exception as e:
			
 
				-                err = ("Could not retrieve the best trial. "
			
 
				-                       "Exit with error: {}".format(e))
			
 
				-                self._logger.log_and_raise_error(err)
			
 
				-
			
 
				-    @property
			
 
				-    def best_trial_score(self) -> float:
			
 
				-        '''
			
 
				-        '''
			
 
				-        if self._trials is None:
			
 
				-            self._logger.error(("Trials object is empty. "
			
 
				-                                "Call run_trials method."))
			
 
				-        else:
			
 
				-            try:
			
 
				-                return self.best_trial["result"]["score"]
			
 
				-            except Exception as e:
			
 
				-                err = ("Could not retrieve the best trial. "
			
 
				-                       "Exit with error: {}".format(e))
			
 
				-                self._logger.log_and_raise_error(err)
			
 
				-
			
 
				-    @property
			
 
				-    def best_trial_score_variance(self) -> float:
			
 
				-        '''
			
 
				-        '''
			
 
				-        if self._trials is None:
			
 
				-            self._logger.error(("Trials object is empty. "
			
 
				-                                "Call run_trials method."))
			
 
				-        else:
			
 
				-            try:
			
 
				-                return self.best_trial["result"]["score_variance"]
			
 
				-            except Exception as e:
			
 
				-                err = ("Could not retrieve the best trial. "
			
 
				-                       "Exit with error: {}".format(e))
			
 
				-                self._logger.log_and_raise_error(err)
			
 
				-
			
 
				-    @property
			
 
				-    def best_trial_pipeline(self) -> Pipeline:
			
 
				-        '''
			
 
				-        '''
			
 
				-        if self._trials is None:
			
 
				-            self._logger.error(("Trials object is empty. "
			
 
				-                                "Call run_trials method."))
			
 
				-        else:
			
 
				-            try:
			
 
				-                return self.best_trial["pipeline"]
			
 
				-            except Exception as e:
			
 
				-                err = ("Could not retrieve the best trial. "
			
 
				-                       "Exit with error: {}".format(e))
			
 
				-                self._logger.log_and_raise_error(err)
			
 
				-
			
 
				-
			
 
				-if __name__ == "__main__":
			
 
				-
			
 
				-    # Runn through test/example
			
 
				-
			
 
				-    from sklearn.datasets import load_breast_cancer
			
 
				-    from sklearn.metrics import accuracy_score
			
 
				-    from cdplib.gridsearch.space_sample import space
			
 
				-    from cdplib.log import Log
			
 
				-
			
 
				-    logger = Log("GridSearchPipelineSelector__TEST:")
			
 
				-
			
 
				-    logger.info("Start test")
			
 
				-
			
 
				-    data_loader = load_breast_cancer()
			
 
				-
			
 
				-    X = data_loader["data"]
			
 
				-    y = data_loader["target"]
			
 
				-
			
 
				-    gs = GridSearchPipelineSelector(cost_func=accuracy_score,
			
 
				-                                    greater_is_better=True,
			
 
				-                                    trials_path="trials_TEST.pkl")
			
 
				-
			
 
				-    gs.attach_space(space=space)
			
 
				-    gs.attach_data(X_train=X, y_train=y, cv=5)
			
 
				-
			
 
				-    gs.run_trials()
			
 
				-
			
 
				-    logger.info("Best trial: {}".format(gs.best_trial))
			
 
				-    logger.info("Best trial pipeline: {}".format(gs.best_trial_pipeline))
			
 
				-
			
 
				-    logger.info("End test")
			
--- a/cdplib/gridsearch/space_sample.py
+++ b/cdplib/gridsearch/space_sample.py
@@ -7,7 +7,7 @@ Created on Mon Oct  5 09:50:24 2020
 
				 """
			
 
				 
			
 
				 from sklearn.ensemble import RandomForestClassifier
			
 
				-from sklearn.feature_selection import SelectKBest
			
 
				+from sklearn.feature_selection import SelectPercentile
			
 
				 from sklearn.linear_model import LogisticRegression
			
 
				 from sklearn.decomposition import PCA
			
 
				 from sklearn.pipeline import Pipeline
			
@@ -18,9 +18,9 @@ space = [
 
				         {"name": "std_scaler_kbest_rf",
			
 
				          "pipeline": Pipeline([
			
 
				                  ("std_scaler", StandardScaler()),
			
 
				-                 ("kbest", SelectKBest()),
			
 
				+                 ("kbest", SelectPercentile()),
			
 
				                  ("rf", RandomForestClassifier())]),
			
 
				-         "params": {"kbest__k": [2, 3],
			
 
				+         "params": {"kbest__percentile": [2, 3],
			
 
				                     "rf__n_estimators": [10, 20]}},
			
 
				 
			
 
				         {"name": "std_scaler_pca_lr",