
change in the abstract PipelineSelector class

tanja 4 years ago
parent
commit
28f4bcc46d
1 changed file with 495 additions and 143 deletions

+ 495 - 143
cdplib/pipeline_selector/PipelineSelector.py

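For orientation, here is a minimal usage sketch of the API this commit introduces (an editor's illustration, not part of the diff): HyperoptPipelineSelector stands in for a concrete child class that implements the abstract methods, and the data set, space module and file paths are assumptions made for the example.

import functools
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, precision_score
# hypothetical concrete child class (hyperopt- or gridsearch-based)
from cdplib.pipeline_selector import HyperoptPipelineSelector

X, y = load_breast_cancer(return_X_y=True)

selector = HyperoptPipelineSelector(
        cost_func=accuracy_score,
        greater_is_better=True,
        trials_path="trials/breast_cancer_trials.pkl",
        additional_metrics={"precision": precision_score},
        strategy_name="baseline",
        stdout_log_level="INFO")

# the space is a list of dicts with keys: name, pipeline, params
selector.attach_space_from_module(module_path="spaces/example_space.py",
                                  name="space")
selector.attach_data(X_train=X, y_train=y, cv=5)

# save the current best summary whenever the score improves
# (configer_summary_saving is the method name as defined in this commit)
selector.configer_summary_saving(
        save_method=functools.partial(pd.DataFrame.to_csv,
                                      **{"path_or_buf": "best_result.csv"}))

selector.run_trials()

print(selector.best_trial_score)
print(selector.best_trial_pipeline)
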
@@ -5,11 +5,15 @@ Created on Wed Sep 30 14:23:23 2020
 
 @author: tanya
 @description: an abstract class for selecting a machine learning
- pipeline in a space of parameter distributions over multiple pipelines.
- The selection is though in such a way that a Trials object is being
+ pipeline from a space (deterministic or random) of parameter distributions
+ over multiple pipelines.
+ The selection is thought in such a way that a Trials object is being
 maintained during the tuning process from which one can retrieve
- the best pipeline so far as well as the entire tuning history
- if needed.
+ the best pipeline so far
+ as well as the entire tuning history if needed.
+ Methods configure_cross_validation and configure_result_saving
+ allow to use a custom cross-validation method and
+ save the current best result in a file or database during training.
 Children classes: hyperopt and custom gridsearch.
 """
 
@@ -18,12 +22,13 @@ import os
 import sys
 import time
 import datetime
-from typing import Callable
 import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod, abstractproperty
+from typing import Callable
+import functools
 from sklearn.pipeline import Pipeline
-from sklearn.model_selection import cross_validate as sklearn_cross_validator
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
 from sklearn.metrics import make_scorer
 from hyperopt import STATUS_OK, STATUS_FAIL
 from cdplib.log import Log
@@ -36,20 +41,28 @@ sys.path.append(os.getcwd())
 class PipelineSelector(ABC):
     """
     An abstract class for selecting a machine learning
-    pipeline in a space of parameter distributions over multiple pipelines.
+    pipeline from a space (deterministic or random) of parameter
+    distributions over multiple pipelines.
     The selection is though in such a way that a Trials object is being
     maintained during the tuning process from which one can retrieve
     the best pipeline so far as well as the entire tuning history
     if needed.
+    Methods configure_cross_validation and configure_result_saving
+    allow to use a custom cross-validation method and
+    save the current best result in a file or database during training.
+    Children classes: hyperopt and custom gridsearch.
     """
     """
     def __init__(self,
     def __init__(self,
-                 cost_func,
+                 cost_func: (Callable, str),
                  greater_is_better: bool,
                  trials_path: str,
-                 backup_trials_freq: int = 1,
-                 cross_val_averaging_func: callable = None):
-        '''
-        :param callable cost_func: function to minimize or maximize
+                 backup_trials_freq: int = None,
+                 cross_val_averaging_func: Callable = None,
+                 additional_metrics: dict = None,
+                 strategy_name: str = None,
+                 stdout_log_level: str = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
 
         :param bool greater_is_better: when True
             cost_func is maximized, else minimized.
@@ -64,67 +77,98 @@ class PipelineSelector(ABC):
 
         :param backup_trials_freq: frequecy in interations (trials)
             of saving the trials object at the trials_path.
+            if None, the trials object is backed up every time
+            the score improves.
 
         :param str log_path: Optional, when not provided logs to stdout.
 
-        :param callable cross_val_averaging_func: optional,
+        :param Callable cross_val_averaging_func: optional,
             when not provided set to mean. Function
             to aggregate the cross-validated values of the cost function.
             Classic situation is to take the mean,
             another example is, for example mean() - c*var().
-        '''
-        self._logger = Log("PipelineSelector: ")
+
+        :param additional_metrics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name: a name might be assigned to the trials,
+            a strategy is defined by the data set, cv object, cost function.
+            When the strategy changes, one should start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
 
         input_errors = [(cost_func, Callable,
-                         "Parameter 'cost_func' must be a callable"),
+                         "Parameter 'cost_func' must be a Callable"),
                        (greater_is_better, bool,
                         "Parameter 'greater_is_better' must be bool type"),
                        (trials_path, str,
                         "Parameter 'trials_path' must be of string type"),
                        (cross_val_averaging_func, (Callable, None.__class__),
                         ("Parameter 'cross_val_averaging_func'"
-                          "must be a callable")),
-                        (backup_trials_freq, int,
-                         "Parameter backup_trials_freq must be an int")]
+                          "must be a Callable")),
+                        (backup_trials_freq, (int, None.__class__),
+                         "Parameter backup_trials_freq must be an int"),
+                        (additional_metrics, (dict, None.__class__),
+                         "Parameter additional_metrics must be a dict"),
+                        (strategy_name, (str, None.__class__),
+                         "Parameter strategy_name must be a str"),
+                        (stdout_log_level, str,
+                         "Parameter stdout_log_level must be a str")]
 
         for p, t, err in input_errors:
             try:
                 assert(isinstance(p, t))
             except AssertionError:
-                self._logger.log_and_raise_error(err)
+                self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        try:
+            assert((additional_metrics is None) or
+                   all([isinstance(metric, Callable)
+                        for metric in additional_metrics.values()]))
+        except AssertionError:
+            err = "Metrics in additional_metrics must be Callables"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
 
         ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
 
         self._cost_func = cost_func
-        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        # score factor is 1 when cost_func is minimized,
+        # -1 when cost func is maximized
         self._score_factor = (not greater_is_better) - greater_is_better
-        self._trials_path = trials_path
+        self.trials_path = trials_path
         self._backup_trials_freq = backup_trials_freq
         self._cross_val_averaging_func = cross_val_averaging_func or np.mean
-        # keeping track of the current search iteration
-        self._run_number = 0
-        # space and data need to be attached to perform search.
-        self._space_attached = False
-        self._data_attached = False
-        self._cross_validator_attached = False
-        # _best_score is the same as best_trial_score property
-        # but is defined in order not to go through all the trials
-        # at each iteration.
-        self._best_score = np.nan
+        self._additional_metrics = additional_metrics or {}
+        self._strategy_name = strategy_name
+        self._data_path = None
+        self._cv_path = None
+
+        # best_score can also be read from trials
+        # but is kept explicitly in order not to
+        # search through the trials object every time
+        # loss is the opposite of score
+        self.best_score = np.nan
+
+        self._cross_validation = sklearn_cross_validation
 
         # if a trials object already exists at the given path,
         # it is loaded and the search is continued. Else,
         # the search is started from the beginning.
-        if os.path.isfile(trials_path):
+        if os.path.isfile(self.trials_path):
             try:
-                with open(trials_path, "rb") as f:
+                with open(self.trials_path, "rb") as f:
                     self._trials = pickle.load(f)
 
-                self._best_score = self.best_trial_score
+                self._start_iteration = self.number_of_trials
+
+                self.best_score = self.best_trial_score
 
                 self._logger.info(("Loaded an existing trials object"
                                    "Consisting of {} trials")
-                                  .format(len(self._trials.trials)))
+                                  .format(self._start_iteration))
 
             except Exception as e:
                 err = ("Trials object could not be loaded. "
@@ -137,75 +181,142 @@ class PipelineSelector(ABC):
                                   "Starting from scratch."))
                                   "Starting from scratch."))
 
 
             self._trials = None
             self._trials = None
+            self._start_iteration = 0
+
+        self.attached_space = False
+        self.attached_data = False
+        self.configured_cross_validation = False
+        self.configured_summary_saving = False
+
+        # keeping track of the current search iteration
+        self._iteration = self._start_iteration
+        self._score_improved = False
+
+        self.start_tuning_time = datetime.datetime.today()
+        self.end_tuning_time = None
+        self.finished_tuning = False
 
     def _backup_trials(self):
         '''
         Pickles (Saves) the trials object.
         Used in a scheduler.
         '''
-        with open(self._trials_path, "wb") as f:
-            pickle.dump(self._trials, f)
+        try:
+            with open(self.trials_path, "wb") as f:
+                pickle.dump(self._trials, f)
+        except Exception as e:
+            err = "Could not backup trials. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
 
-    def attach_cross_validator(self, cross_validator: Callable = None,
-                               module_path: str = None,
-                               name: str = None):
+    def configure_cross_validation(self,
+                                   cross_validation: Callable,
+                                   kwargs: dict = None):
         """
         """
         Method for attaching a custom cross-validation function
         Method for attaching a custom cross-validation function
-        :param cross_validator: a function that has the same
+        :param cross_validation: a function that has the same
              signature as sklearn.model_selection.cross_validate
+        """
+        try:
+            assert(isinstance(cross_validation, Callable))
+        except AssertionError:
+            err = "Parameter cross_validation must be a function"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        try:
+            kwargs = kwargs or {}
+            assert(isinstance(kwargs, dict))
+        except AssertionError:
+            err = "Paramter kwargs must be a dict"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        try:
+            self._cross_validation = functools.partial(
+                    cross_validation, **kwargs)
+
+            self.configured_cross_validation = True
+
+            if hasattr(cross_validation, "__name__"):
+                self.best_result["cross_validation"] =\
+                    cross_validation.__name__
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to configure cross-validation. "
+                   "Exit with error: {}".format(e))
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation_from_module(self,
+                                               module_path: str,
+                                               name: str):
+        """
         :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
+            where the cross_validation function is defined.
 
-        :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
+        :param str name: name of the cross validation function
+            loaded from a python module.
         """
         """
         try:
         try:
-            assert((cross_validator is not None) or
-                   ((module_path is not None) and (name is not None)))
+            assert(isinstance(module_path, str) and
+                   isinstance(name, str))
         except AssertionError:
-            err = ("Either cross_validator or "
-                   "(module_path, name) must be provided")
-            self._logger.log_and_raise_error(err)
+            err = "Parameters module_path and name must be of str type"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
 
-        self._cross_validator = cross_validator or\
-            LoadingUtils().load_from_module(module_path=module_path, name=name)
+        try:
+            self._cross_validation = \
+                LoadingUtils().load_from_module(
+                        module_path=module_path, name=name)
 
-        self._logger.info("Attached a cross validator")
-        self._cross_validator_attached = True
+            self.configured_cross_validation = True
 
-    def attach_space(self, space=None,
-                     module_path: str = None,
-                     name: str = None):
-        '''
+            self.best_result["cross_validation"] = name
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to load cross-validation from module. "
+                   "Exit with error: {}".format(e))
+            self._logger.log_and_raise_error(err)
+
+    def attach_space(self, space):
+        """
         :param space: space where
-            the search is performed. Optional when a space
-            is loaded from a python module. A space might be either
+            the search is performed. A space might be either
             a list of dictionaries or a hyperopt space object
             the elements of which are dictionaries with keys:
             name, pipeline, params
+        """
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self.attached_space = True
 
+    def attach_space_from_module(self, module_path: str, name: str):
+        """
         :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
+            where the space is defined.
 
         :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
-        '''
+            a python module.
+        """
         try:
-            assert((space is not None) or
-                   ((module_path is not None) and (name is not None)))
+            assert(isinstance(module_path, str) and
+                   isinstance(name, str))
         except AssertionError:
-            err = "Either space or (module_path, name) must be provided"
-            self._logger.log_and_raise_error(err)
+            err = "Parameters module_path and name must be of str type"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
 
-        self._space = space or LoadingUtils().load_from_module(
-                module_path=module_path, name=name)
+        try:
+            self._space = LoadingUtils().load_from_module(
+                    module_path=module_path, name=name)
 
-        self._logger.info("Attached parameter distribution space")
-        self._space_attached = True
+            self._logger.info("Attached parameter distribution space")
+
+            self.attached_space = True
+        except Exception as e:
+            err = ("Failed to attach space from module. "
+                   "Exit with error {}".format(e))
+            self._logger.log_and_raise_error(err)
 
     def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
                     y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
@@ -242,25 +353,28 @@ class PipelineSelector(ABC):
                        isinstance(y_train, (pd.Series, np.ndarray,
                                             pd.DataFrame, NoneType)) and
                        isinstance(y_val, (pd.Series, np.ndarray)) and
-                       ((y_val is None) if (y_train is None)
-                        else (y_val is not None)))
+                       (y_val is None) == (y_train is None))
             except AssertionError:
                 self._logger.log_and_raise_error(input_err)
 
-            # cost is evaluated with a cross validation function
-            # that accepts an array and a cv object with
-            # indices of the fold splits.
-            # Here we create a trivial cv object
-            # with one validation split.
-
-            train_inds = list(range(len(X_train)))
-            val_inds = list(range(len(X_train),
-                                  len(X_train) + len(X_val)))
-
-            self._cv = [(train_inds, val_inds)]
-            self._X = np.concatenate([X_train, X_val])
-            self._y = None if y_train is None\
-                else np.concatenate([y_train, y_val])
+            try:
+                # cost is evaluated with a cross validation function
+                # that accepts an array and a cv object with
+                # indices of the fold splits.
+                # Here we create a trivial cv object
+                # with one validation split.
+
+                train_inds = list(range(len(X_train)))
+                val_inds = list(range(len(X_train),
+                                      len(X_train) + len(X_val)))
+
+                self._cv = [(train_inds, val_inds)]
+                self._X = np.concatenate([X_train, X_val])
+                self._y = None if y_train is None\
+                    else np.concatenate([y_train, y_val])
+            except Exception as e:
+                err = "Failed to attach data. Exit with error: {}".format(e)
+                self._logger.log_and_raise_error(err)
 
         else:
             try:
@@ -276,10 +390,139 @@ class PipelineSelector(ABC):
             self._y = y_train
 
         self._logger.info("Attached data")
-        self._data_attached = True
+        self.attached_data = True
 
-    def _evaluate(self, pipeline: Pipeline) -> dict:
-        '''
+    def attach_data_from_hdf5(self,
+                              data_hdf5_store_path: str,
+                              cv_pickle_path: str = None):
+        """
+        Method for attaching data from a hdf5 store.
+             The hdf5 store is a binary file,
+             after loading it, it is a dictionary with keys
+             X_train (y_train, X_val, y_val). The cv is loaded
+             from a pickle file. The reason to separate the data
+             store from the cv store is that hdf5 is optimized to
+             store large dataframes (especially with simple types) and
+             a small list of lists like a cv-object is better
+             to be stored as a pickle file.
+        :param str data_hdf5_store_path: path to the hdf5 store
+            with train and validation data
+        :param str cv_pickle_path: path to the pickle file with
+            the cv data
+        """
+        try:
+            assert(os.path.isfile(data_hdf5_store_path))
+        except AssertionError:
+            err = "Parameter hdf5_store_path is not a file"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        # load the hdf5 store
+        try:
+            store = pd.HDFStore(data_hdf5_store_path)
+            self._data_path = data_hdf5_store_path
+        except Exception as e:
+            err = "Could not load the hdf5 store. Exit with error: {}."\
+                .format(e)
+            self._logger.log_and_raise_error(err)
+
+        data_input = {}
+
+        for key in ["/X_train", "/y_train", "/X_val", "/y_val"]:
+            if key not in store.keys():
+                data_input[key.replace("/", "")] = None
+            else:
+                data_input[key.replace("/", "")] = store[key]
+
+        if cv_pickle_path is not None:
+            try:
+                assert(os.path.isfile(cv_pickle_path))
+            except AssertionError:
+                err = "Parameter hdf5_store_path is not a file"
+                self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+            try:
+                data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
+                self._cv_path = cv_pickle_path
+            except Exception as e:
+                err = "Could not load the pickeled cv. Exit with error: {}."\
+                    .format(e)
+                self._logger.log_and_raise_error(err)
+        else:
+            data_input["cv"] = None
+
+        self.attach_data(**data_input)
+
+        store.close()
+
+    def configer_summary_saving(self,
+                                save_method: Callable = None,
+                                kwargs: dict = None):
+        """
+        Attaching a method for saving information about
+             the trials/space/strategy and the result of
+             the current best pipeline. This method can
+             save the result in a txt or a json file,
+             or in a database for example. Arguments like
+             file path or the table name can be specified in kwargs.
+        :param Callable save_method: method for saving the result
+            of the pipeline selection. The method must accept
+            a pandas DataFrame as argument. See self._save_result
+            method for the format of the argument being saved.
+            By default, saving to a csv file.
+            Examples:
+                functools.partial(pd.DataFrame.to_csv,
+                                  **{"path_or_buf": <PATH>})
+                functools.partial(np.savetxt, **{"fname": <PATH>})
+
+                functools.partial(SQLHandler(<URI>).append_to_table,
+                                  **{"tablename": <NAME>})
+
+                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
+                                  **{"collection_name": <NAME>})
+
+            using functools can be avoided by providing the kwarg argument
+        :param dict kwargs: a dictionary with keyword arguments
+            (like tablename) to provide to the save_method
+        """
+        try:
+            save_method = save_method or functools.partial(
+                    pd.DataFrame.to_csv, **{"path_or_buf": "result.csv"})
+
+            kwargs = kwargs or {}
+
+            self._save_method = functools.partial(save_method, **kwargs)
+
+            self.configured_summary_saving = True
+
+            self._logger.info("Configured summary saving")
+
+        except Exception as e:
+            err = ("Failed to configure the summary saving. "
+                   "Exit with error {}".format(e))
+            self._logger.log_and_raise_error(err)
+
+    def _save_summary(self, summary: dict):
+        """
+        """
+        try:
+            assert(self.configured_summary_saving)
+        except AssertionError:
+            err = "Result saving must be configured first"
+            self._logger.log_and_raise_error(err, ErrorType=AssertionError)
+
+        try:
+            self._save_method(summary)
+
+        except Exception as e:
+            err = ("Could not configure summary saving. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _evaluate(self, pipeline: Pipeline,
+                  scoring: Callable = None,
+                  cross_validation: Callable = None) -> dict:
+        """
         This method is called in _objective.
 
         Calculates the cost on the attached data.
@@ -289,24 +532,46 @@ class PipelineSelector(ABC):
 
         :param Pipeline pipeline: machine learning pipeline
             that will be evaluated with cross-validation
+        :param cross_validation: a function that has the same
+             signature as sklearn.model_selection.cross_validate
 
-        :output: dictionary with the aggregated
+        :return: dictionary with the aggregated
             cross-validation score and
             the score variance.
-        '''
-        if not self._cross_validator_attached:
-            self._cross_validator = sklearn_cross_validator
+        """
+        try:
+
+            scoring = {"score": make_scorer(self._cost_func)}
+
+            scoring.update({metric_name: make_scorer(metric)
+                            for metric_name, metric
+                            in self._additional_metrics.items()})
+
+            scores = self._cross_validation(
+                    estimator=pipeline,
+                    X=self._X,
+                    y=self._y,
+                    cv=self._cv or 5,
+                    scoring=scoring,
+                    error_score=np.nan)
+
+            scores_average = {
+                    metric_name.replace("test_", ""):
+                    self._cross_val_averaging_func(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            scores_variance = {
+                    metric_name.replace("test_", "") + "_variance":
+                    np.var(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
 
 
-        scores = self._cross_validator(
-                estimator=pipeline,
-                X=self._X,
-                y=self._y,
-                cv=self._cv or 5,
-                scoring=make_scorer(self._cost_func),
-                error_score=np.nan)
+            return {**scores_average, **scores_variance}
 
-        return {'value': self._cross_val_averaging_func(scores['test_score']),
-                'variance': np.var(scores['test_score'])}
+        except Exception as e:
+            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
 
     def _objective(self, space_element: dict) -> dict:
         '''
@@ -354,79 +619,137 @@ class PipelineSelector(ABC):
 
         start_time = time.time()
 
-        if not self._data_attached:
+        try:
+            assert(self.attached_data)
+        except AssertionError:
             err = ("Data must be attached in order "
             err = ("Data must be attached in order "
                    "in order to effectuate the best"
                    "in order to effectuate the best"
                    "pipeline search")
                    "pipeline search")
             self._logger.log_and_raise_error(err)
             self._logger.log_and_raise_error(err)
 
-        self._run_number += 1
+        summary = {}
+
+        if self._strategy_name is not None:
+            summary["strategy_name"] = self._strategy_name
+
+        if isinstance(self._cost_func, str):
+            summary["cost_func"] = self._cost_func
+
+        elif hasattr(self._cost_func, "__name__"):
+            summary["cost_func"] = self._cost_func.__name__
+
+        summary["trials_path"] = self.trials_path
+
+        if self._data_path is not None:
+            summary["data_path"] = self._data_path
+
+        if self._cv_path is not None:
+            summary["cv_path"] = self._cv_path
 
-        pipeline = space_element['pipeline']
-        params = space_element['params']
-        pipeline.set_params(**params)
+        summary["start_tuning_time"] = self.start_tuning_time
 
-        self._logger.info(("Run number {0}: "
-                           "Current score is {1}: "
-                           "Training pipeline {2} "
-                           "with parameters: {3}. ").format(
-                             self._run_number,
-                             self._best_score,
-                             space_element['name'],
-                             params))
+        summary["iteration"] = self._iteration
+
+        backup_cond = (self._backup_trials_freq is not None) and\
+            ((self._iteration - self._start_iteration - 1) %
+             self._backup_trials_freq == 0) or\
+            self._score_improved
+
+        if backup_cond:
+            self._backup_trials()
+            self._score_improved = False
 
         try:
+            pipeline = space_element['pipeline']
+            params = space_element['params']
+            pipeline.set_params(**params)
+
+            self._logger.info(("Iteration {0}: "
+                               "Current score is {1}: "
+                               "Training pipeline {2} "
+                               "with parameters: {3}. ").format(
+                                  self._iteration,
+                                  self.best_score,
+                                  space_element['name'],
+                                  params))
+
             result = self._evaluate(pipeline)
 
-            assert(not np.isnan(result["value"]))
+            summary.update(result)
 
-            if self._run_number % self._backup_trials_freq == 0:
-                self._backup_trials()
+            end_time = time.time()
 
-            if (self._best_score != self._best_score) or\
-                self._score_factor*result["value"] <\
-                    self._score_factor*self._best_score:
+            assert(not np.isnan(result["score"])),\
+                "Score value is not in the output of the _evaluate method"
 
-                self._logger.info("Score got better, new best score is: {}"
-                                  .format(result["value"]))
+            summary['status'] = STATUS_OK
+            summary.update(result)
+            summary['loss'] = self._score_factor * summary['score']
+            summary['timestamp'] = datetime.datetime.today()
+            summary['train_time'] = end_time - start_time
 
-                self._best_score = result['value']
+            self._iteration += 1
 
-            end_time = time.time()
+            self._score_improved = (self.best_score != self.best_score) or\
+                                   (self._score_factor*result["score"] <
+                                    self._score_factor*self.best_score)
 
-            return {'loss': self._score_factor * result["value"],
-                    'status': STATUS_OK,
-                    'score': result["value"],
-                    'score_variance': result["variance"],
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': end_time - start_time}
+            if self._score_improved:
+
+                self._logger.info("Score improved, new best score is: {}"
+                                  .format(result["score"]))
+
+                self.best_score = result['score']
+
+                if self.configured_summary_saving:
+                    self._save_summary(summary)
 
         except Exception as e:
 
             self._logger.warning("Trial failed with error {}".format(e))
 
-            return {'loss': np.nan,
-                    'status': STATUS_FAIL,
-                    'score': np.nan,
-                    'score_variance': np.nan,
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': np.nan}
+            summary['status'] = STATUS_FAIL
+            summary['timestamp'] = datetime.datetime.today()
+            summary['error'] = e
+            for key in ['loss', 'score', 'score_variance', 'train_time']:
+                summary[key] = np.nan
+
+        return summary
 
     @abstractmethod
     def run_trials(self):
         """
+        Method that runs the hyperparameter tuning over possibly multiple
+        pipeline types specified in self.space
+        When run_trials method is finished the flag self.finished_tuning
+        should be set to True and the methods self._backup_trials and
+        optionally self._save_result should be called.
         """
         """
         pass
         pass
 
 
     @abstractproperty
     @abstractproperty
-    def best_trial(self) -> float:
+    def number_of_trials(self) -> int:
         """
         """
+        Number of trials already run in the current trials object
+        """
+        pass
+
+    @abstractproperty
+    def best_trial(self) -> dict:
+        """
+        Best trial so far.
+         Should contain the best pipeline,
+         best hyperparameters,
+         as well as an output of the self._objective method,
+         but the exact form of the output depends on the implementation
+         of the Trials object.
         """
         """
         pass
         pass
 
 
     @abstractproperty
     @abstractproperty
     def best_trial_score(self) -> float:
     def best_trial_score(self) -> float:
         """
         """
+        Score of the best pipeline with the best hyperparameters
         """
         """
         pass
         pass
 
 
@@ -439,5 +762,34 @@ class PipelineSelector(ABC):
     @abstractproperty
     def best_trial_pipeline(self) -> Pipeline:
         """
+        Best pipeline with best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines(self, n: int) -> list:
+        """
+        N best pipelines with corresponding
+        best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
+        """
+        If the hyperparameter search is done over multiple
+        pipelines, then returns n different pipeline-types
+        with corresponding hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def trials_to_excel(self, path: str):
+        """
+        Trials object in the shape of table written to excel,
+        should contain the iteration, pipeline (as str),
+        hyperparameters (as str), self.best_result (see self._objective method)
+        as well as additional information configured
+        through self.save_result method.
         """
         """
         pass
         pass
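
For reference, the abstract methods and properties visible in this diff imply a child-class skeleton along the following lines (an editor's sketch only; the real children are the hyperopt and custom gridsearch selectors mentioned in the class docstring, and the class name below is assumed):

from sklearn.pipeline import Pipeline

from cdplib.pipeline_selector.PipelineSelector import PipelineSelector


class GridSearchPipelineSelector(PipelineSelector):
    """
    Hypothetical child class: exhaustive search over a space given
    as a list of dicts with keys: name, pipeline, params.
    """

    def run_trials(self):
        # loop over the attached space, call self._objective on each
        # space element, keep self._trials and self.best_score updated,
        # back up the trials and set self.finished_tuning = True at the end
        ...

    @property
    def number_of_trials(self) -> int:
        ...

    @property
    def best_trial(self) -> dict:
        ...

    @property
    def best_trial_score(self) -> float:
        ...

    @property
    def best_trial_pipeline(self) -> Pipeline:
        ...

    def get_n_best_trial_pipelines(self, n: int) -> list:
        ...

    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        ...

    def trials_to_excel(self, path: str):
        ...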