|
@@ -5,11 +5,15 @@ Created on Wed Sep 30 14:23:23 2020
|
|
|
|
|
|
@author: tanya
|
|
|
@description: an abstract class for selecting a machine learning
|
|
|
- pipeline in a space of parameter distributions over multiple pipelines.
|
|
|
- The selection is though in such a way that a Trials object is being
|
|
|
+ pipeline from a space (deterministic or random) of parameter distributions
|
|
|
+ over multiple pipelines.
|
|
|
+ The selection is thought in such a way that a Trials object is being
|
|
|
maintained during the tuning process from which one can retrieve
|
|
|
- the best pipeline so far as well as the entire tuning history
|
|
|
- if needed.
|
|
|
+ the best pipeline so far
|
|
|
+ as well as the entire tuning history if needed.
|
|
|
+ Methods configure_cross_validation and configure_result_saving
|
|
|
+ allow the use of a custom cross-validation method and
|
|
|
+ save the current best result in a file or database during training.
|
|
|
Children classes: hyperopt and custom gridsearch.
|
|
|
"""
|
|
|
|
|
@@ -18,12 +22,13 @@ import os
|
|
|
import sys
|
|
|
import time
|
|
|
import datetime
|
|
|
-from typing import Callable
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from abc import ABC, abstractmethod, abstractproperty
|
|
|
+from typing import Callable
|
|
|
+import functools
|
|
|
from sklearn.pipeline import Pipeline
|
|
|
-from sklearn.model_selection import cross_validate as sklearn_cross_validator
|
|
|
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
|
|
|
from sklearn.metrics import make_scorer
|
|
|
from hyperopt import STATUS_OK, STATUS_FAIL
|
|
|
from cdplib.log import Log
|
|
@@ -36,20 +41,28 @@ sys.path.append(os.getcwd())
|
|
|
class PipelineSelector(ABC):
|
|
|
"""
|
|
|
An abstract class for selecting a machine learning
|
|
|
- pipeline in a space of parameter distributions over multiple pipelines.
|
|
|
+ pipeline from a space (deterministic or random) of parameter
|
|
|
+ distributions over multiple pipelines.
|
|
|
The selection is though in such a way that a Trials object is being
|
|
|
maintained during the tuning process from which one can retrieve
|
|
|
the best pipeline so far as well as the entire tuning history
|
|
|
if needed.
|
|
|
+ Methods configure_cross_validation and configure_result_saving
|
|
|
+ allow the use of a custom cross-validation method and
|
|
|
+ save the current best result in a file or database during training.
|
|
|
+ Children classes: hyperopt and custom gridsearch.
|
|
|
"""
|
|
|
def __init__(self,
|
|
|
- cost_func,
|
|
|
+ cost_func: (Callable, str),
|
|
|
greater_is_better: bool,
|
|
|
trials_path: str,
|
|
|
- backup_trials_freq: int = 1,
|
|
|
- cross_val_averaging_func: callable = None):
|
|
|
- '''
|
|
|
- :param callable cost_func: function to minimize or maximize
|
|
|
+ backup_trials_freq: int = None,
|
|
|
+ cross_val_averaging_func: Callable = None,
|
|
|
+ additional_metrics: dict = None,
|
|
|
+ strategy_name: str = None,
|
|
|
+ stdout_log_level: str = "INFO"):
|
|
|
+ """
|
|
|
+ :param Callable cost_func: function to minimize or maximize
|
|
|
|
|
|
:param bool greater_is_better: when True
|
|
|
cost_func is maximized, else minimized.
|
|
@@ -64,67 +77,98 @@ class PipelineSelector(ABC):
|
|
|
|
|
|
:param backup_trials_freq: frequecy in interations (trials)
|
|
|
of saving the trials object at the trials_path.
|
|
|
+ if None, the trials object is backed up every time
|
|
|
+ the score improves.
|
|
|
|
|
|
:param str log_path: Optional, when not provided logs to stdout.
|
|
|
|
|
|
- :param callable cross_val_averaging_func: optional,
|
|
|
+ :param Callable cross_val_averaging_func: optional,
|
|
|
when not provided set to mean. Function
|
|
|
to aggregate the cross-validated values of the cost function.
|
|
|
Classic situation is to take the mean,
|
|
|
another example is, for example mean() - c*var().
|
|
|
- '''
|
|
|
- self._logger = Log("PipelineSelector: ")
|
|
|
+
|
|
|
+ :param additional_metrics: dict of additional metrics to save
|
|
|
+ of the form {"metric_name": metric} where metric is a Callable.
|
|
|
+
|
|
|
+ :param str strategy_name: a name might be assigned to the trials,
|
|
|
+ a strategy is defined by the data set, cv object, cost function.
|
|
|
+ When the strategy changes, one should start with new trials.
|
|
|
+
|
|
|
+ :param str stdout_log_level: can be INFO, WARNING, ERROR
|
|
|
+ """
|
|
|
+ self._logger = Log("PipelineSelector: ",
|
|
|
+ stdout_log_level=stdout_log_level)
|
|
|
|
|
|
input_errors = [(cost_func, Callable,
|
|
|
- "Parameter 'cost_func' must be a callable"),
|
|
|
+ "Parameter 'cost_func' must be a Callable"),
|
|
|
(greater_is_better, bool,
|
|
|
"Parameter 'greater_is_better' must be bool type"),
|
|
|
(trials_path, str,
|
|
|
"Parameter 'trials_path' must be of string type"),
|
|
|
(cross_val_averaging_func, (Callable, None.__class__),
|
|
|
("Parameter 'cross_val_averaging_func'"
|
|
|
- "must be a callable")),
|
|
|
- (backup_trials_freq, int,
|
|
|
- "Parameter backup_trials_freq must be an int")]
|
|
|
+ "must be a Callable")),
|
|
|
+ (backup_trials_freq, (int, None.__class__),
|
|
|
+ "Parameter backup_trials_freq must be an int"),
|
|
|
+ (additional_metrics, (dict, None.__class__),
|
|
|
+ "Parameter additional_metrics must be a dict"),
|
|
|
+ (strategy_name, (str, None.__class__),
|
|
|
+ "Parameter strategy_name must be a str"),
|
|
|
+ (stdout_log_level, str,
|
|
|
+ "Parameter stdout_log_level must be a str")]
|
|
|
|
|
|
for p, t, err in input_errors:
|
|
|
try:
|
|
|
assert(isinstance(p, t))
|
|
|
except AssertionError:
|
|
|
- self._logger.log_and_raise_error(err)
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
+
|
|
|
+ try:
|
|
|
+ assert((additional_metrics is None) or
|
|
|
+ all([isinstance(metric, Callable)
|
|
|
+ for metric in additional_metrics.values()]))
|
|
|
+ except AssertionError:
|
|
|
+ err = "Metrics in additional_metrics must be Callables"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
|
|
|
ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
|
|
|
|
|
|
self._cost_func = cost_func
|
|
|
- # is 1 when cost_func is minimized, -1 when cost func is maximized
|
|
|
+ # score factor is 1 when cost_func is minimized,
|
|
|
+ # -1 when cost func is maximized
|
|
|
self._score_factor = (not greater_is_better) - greater_is_better
|
|
|
- self._trials_path = trials_path
|
|
|
+ self.trials_path = trials_path
|
|
|
self._backup_trials_freq = backup_trials_freq
|
|
|
self._cross_val_averaging_func = cross_val_averaging_func or np.mean
|
|
|
- # keeping track of the current search iteration
|
|
|
- self._run_number = 0
|
|
|
- # space and data need to be attached to perform search.
|
|
|
- self._space_attached = False
|
|
|
- self._data_attached = False
|
|
|
- self._cross_validator_attached = False
|
|
|
- # _best_score is the same as best_trial_score property
|
|
|
- # but is defined in order not to go through all the trials
|
|
|
- # at each iteration.
|
|
|
- self._best_score = np.nan
|
|
|
+ self._additional_metrics = additional_metrics or {}
|
|
|
+ self._strategy_name = strategy_name
|
|
|
+ self._data_path = None
|
|
|
+ self._cv_path = None
|
|
|
+
|
|
|
+ # best_score can be also read from trials
|
|
|
+ # but is kept explicitly in order not to
|
|
|
+ # search through the trials object every time
|
|
|
+ # loss is the opposite of score
|
|
|
+ self.best_score = np.nan
|
|
|
+
|
|
|
+ self._cross_validation = sklearn_cross_validation
|
|
|
|
|
|
# if a trials object already exists at the given path,
|
|
|
# it is loaded and the search is continued. Else,
|
|
|
# the search is started from the beginning.
|
|
|
- if os.path.isfile(trials_path):
|
|
|
+ if os.path.isfile(self.trials_path):
|
|
|
try:
|
|
|
- with open(trials_path, "rb") as f:
|
|
|
+ with open(self.trials_path, "rb") as f:
|
|
|
self._trials = pickle.load(f)
|
|
|
|
|
|
- self._best_score = self.best_trial_score
|
|
|
+ self._start_iteration = self.number_of_trials
|
|
|
+
|
|
|
+ self.best_score = self.best_trial_score
|
|
|
|
|
|
self._logger.info(("Loaded an existing trials object"
|
|
|
"Consisting of {} trials")
|
|
|
- .format(len(self._trials.trials)))
|
|
|
+ .format(self._start_iteration))
|
|
|
|
|
|
except Exception as e:
|
|
|
err = ("Trials object could not be loaded. "
|
|
@@ -137,75 +181,142 @@ class PipelineSelector(ABC):
|
|
|
"Starting from scratch."))
|
|
|
|
|
|
self._trials = None
|
|
|
+ self._start_iteration = 0
|
|
|
+
|
|
|
+ self.attached_space = False
|
|
|
+ self.attached_data = False
|
|
|
+ self.configured_cross_validation = False
|
|
|
+ self.configured_summary_saving = False
|
|
|
+
|
|
|
+ # keeping track of the current search iteration
|
|
|
+ self._iteration = self._start_iteration
|
|
|
+ self._score_improved = False
|
|
|
+
|
|
|
+ self.start_tuning_time = datetime.datetime.today()
|
|
|
+ self.end_tuning_time = None
|
|
|
+ self.finished_tuning = False
|
|
|
|
|
|
def _backup_trials(self):
|
|
|
'''
|
|
|
Pickles (Saves) the trials object.
|
|
|
Used in a scheduler.
|
|
|
'''
|
|
|
- with open(self._trials_path, "wb") as f:
|
|
|
- pickle.dump(self._trials, f)
|
|
|
+ try:
|
|
|
+ with open(self.trials_path, "wb") as f:
|
|
|
+ pickle.dump(self._trials, f)
|
|
|
+ except Exception as e:
|
|
|
+ err = "Could not backup trials. Exit with error: {}".format(e)
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
|
|
- def attach_cross_validator(self, cross_validator: Callable = None,
|
|
|
- module_path: str = None,
|
|
|
- name: str = None):
|
|
|
+ def configure_cross_validation(self,
|
|
|
+ cross_validation: Callable,
|
|
|
+ kwargs: dict = None):
|
|
|
"""
|
|
|
Method for attaching a custom cross-validation function
|
|
|
- :param cross_validator: a function that has the same
|
|
|
+ :param cross_validation: a function that has the same
|
|
|
signature as sklearn.model_selection.cross_validate
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ assert(isinstance(cross_validation, Callable))
|
|
|
+ except AssertionError:
|
|
|
+ err = "Parameter cross_validation must be a function"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
+
|
|
|
+ try:
|
|
|
+ kwargs = kwargs or {}
|
|
|
+ assert(isinstance(kwargs, dict))
|
|
|
+ except AssertionError:
|
|
|
+ err = "Paramter kwargs must be a dict"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
+
|
|
|
+ try:
|
|
|
+ self._cross_validation = functools.partial(
|
|
|
+ self._cross_validation, **kwargs)
|
|
|
+
|
|
|
+ self.configured_cross_validation = True
|
|
|
+
|
|
|
+ if hasattr(cross_validation, "__name__"):
|
|
|
+ self.best_result["cross_validation"] =\
|
|
|
+ cross_validation.__name__
|
|
|
+
|
|
|
+ self._logger.info("Configured cross validation")
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ err = ("Failed to configure cross-validation. "
|
|
|
+ "Exit with error: {}".format(e))
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
+
|
|
|
+ def configure_cross_validation_from_module(self,
|
|
|
+ module_path: str,
|
|
|
+ name: str):
|
|
|
+ """
|
|
|
:param str module_path: path to python module
|
|
|
- where the space is defined. Optional when
|
|
|
- the space is provided directly.
|
|
|
+ where the cross_validation function is defined.
|
|
|
|
|
|
- :param str name: name of the space loaded from
|
|
|
- a python module. Optional when the space
|
|
|
- is provided directly.
|
|
|
+ :param str name: name of the cross validation function
|
|
|
+ loaded from a python module.
|
|
|
"""
|
|
|
try:
|
|
|
- assert((cross_validator is not None) or
|
|
|
- ((module_path is not None) and (name is not None)))
|
|
|
+ assert(isinstance(module_path, str) and
|
|
|
+ isinstance(name, str))
|
|
|
except AssertionError:
|
|
|
- err = ("Either cross_validator or "
|
|
|
- "(module_path, name) must be provided")
|
|
|
- self._logger.log_and_raise_error(err)
|
|
|
+ err = "Parameters module_path and name must be of str type"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
|
|
|
- self._cross_validator = cross_validator or\
|
|
|
- LoadingUtils().load_from_module(module_path=module_path, name=name)
|
|
|
+ try:
|
|
|
+ self._cross_validation = \
|
|
|
+ LoadingUtils().load_from_module(
|
|
|
+ module_path=module_path, name=name)
|
|
|
|
|
|
- self._logger.info("Attached a cross validator")
|
|
|
- self._cross_validator_attached = True
|
|
|
+ self.configured_cross_validation = True
|
|
|
|
|
|
- def attach_space(self, space=None,
|
|
|
- module_path: str = None,
|
|
|
- name: str = None):
|
|
|
- '''
|
|
|
+ self.best_result["cross_validation"] = name
|
|
|
+
|
|
|
+ self._logger.info("Configured cross validation")
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ err = ("Failed to load cross-validation from module. "
|
|
|
+ "Exit with error: {}".format(e))
|
|
|
+ self._logger.log_and_raise_error(e)
|
|
|
+
|
|
|
+ def attach_space(self, space):
|
|
|
+ """
|
|
|
:param space: space where
|
|
|
- the search is performed. Optional when a space
|
|
|
- is loaded from a python module. A space might be either
|
|
|
+ the search is performed. A space might be either
|
|
|
a list of dictionaries or a hyperopt space object
|
|
|
the elements of which are dictionaries with keys:
|
|
|
name, pipeline, params
|
|
|
+ """
|
|
|
+ self._space = space
|
|
|
+ self._logger.info("Attached parameter distribution space")
|
|
|
+ self.attached_space = True
|
|
|
|
|
|
+ def attach_space_from_module(self, module_path: str, name: str):
|
|
|
+ """
|
|
|
:param str module_path: path to python module
|
|
|
- where the space is defined. Optional when
|
|
|
- the space is provided directly.
|
|
|
+ where the space is defined.
|
|
|
|
|
|
:param str name: name of the space loaded from
|
|
|
- a python module. Optional when the space
|
|
|
- is provided directly.
|
|
|
- '''
|
|
|
+ a python module.
|
|
|
+ """
|
|
|
try:
|
|
|
- assert((space is not None) or
|
|
|
- ((module_path is not None) and (name is not None)))
|
|
|
+ assert(isinstance(module_path, str) and
|
|
|
+ isinstance(name, str))
|
|
|
except AssertionError:
|
|
|
- err = "Either space or (module_path, name) must be provided"
|
|
|
- self._logger.log_and_raise_error(err)
|
|
|
+ err = "Parameters module_path and name must be of str type"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
|
|
|
- self._space = space or LoadingUtils().load_from_module(
|
|
|
- module_path=module_path, name=name)
|
|
|
+ try:
|
|
|
+ self._space = LoadingUtils().load_from_module(
|
|
|
+ module_path=module_path, name=name)
|
|
|
|
|
|
- self._logger.info("Attached parameter distribution space")
|
|
|
- self._space_attached = True
|
|
|
+ self._logger.info("Attached parameter distribution space")
|
|
|
+
|
|
|
+ self.attached_space = True
|
|
|
+ except Exception as e:
|
|
|
+ err = ("Failed to attach space from module. "
|
|
|
+ "Exit with error {}".format(e))
|
|
|
+ self._logger.loger_and_raise_error(err)
|
|
|
|
|
|
def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
|
|
|
y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
|
|
@@ -242,25 +353,28 @@ class PipelineSelector(ABC):
|
|
|
isinstance(y_train, (pd.Series, np.ndarray,
|
|
|
pd.DataFrame, NoneType)) and
|
|
|
isinstance(y_val, (pd.Series, np.ndarray)) and
|
|
|
- ((y_val is None) if (y_train is None)
|
|
|
- else (y_val is not None)))
|
|
|
+ (y_val is None) == (y_train is None))
|
|
|
except AssertionError:
|
|
|
self._logger.log_and_raise_error(input_err)
|
|
|
|
|
|
- # cost is evaluated with a cross validation function
|
|
|
- # that accepts an array and a cv object with
|
|
|
- # indices of the fold splits.
|
|
|
- # Here we create a trivial cv object
|
|
|
- # with one validation split.
|
|
|
-
|
|
|
- train_inds = list(range(len(X_train)))
|
|
|
- val_inds = list(range(len(X_train),
|
|
|
- len(X_train) + len(X_val)))
|
|
|
-
|
|
|
- self._cv = [(train_inds, val_inds)]
|
|
|
- self._X = np.concatenate([X_train, X_val])
|
|
|
- self._y = None if y_train is None\
|
|
|
- else np.concatenate([y_train, y_val])
|
|
|
+ try:
|
|
|
+ # cost is evaluated with a cross validation function
|
|
|
+ # that accepts an array and a cv object with
|
|
|
+ # indices of the fold splits.
|
|
|
+ # Here we create a trivial cv object
|
|
|
+ # with one validation split.
|
|
|
+
|
|
|
+ train_inds = list(range(len(X_train)))
|
|
|
+ val_inds = list(range(len(X_train),
|
|
|
+ len(X_train) + len(X_val)))
|
|
|
+
|
|
|
+ self._cv = [(train_inds, val_inds)]
|
|
|
+ self._X = np.concatenate([X_train, X_val])
|
|
|
+ self._y = None if y_train is None\
|
|
|
+ else np.concatenate([y_train, y_val])
|
|
|
+ except Exception as e:
|
|
|
+ err = "Failed to attach data. Exit with error: {}".format(e)
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
|
|
else:
|
|
|
try:
|
|
@@ -276,10 +390,139 @@ class PipelineSelector(ABC):
|
|
|
self._y = y_train
|
|
|
|
|
|
self._logger.info("Attached data")
|
|
|
- self._data_attached = True
|
|
|
+ self.attached_data = True
|
|
|
|
|
|
- def _evaluate(self, pipeline: Pipeline) -> dict:
|
|
|
- '''
|
|
|
+ def attach_data_from_hdf5(self,
|
|
|
+ data_hdf5_store_path: str,
|
|
|
+ cv_pickle_path: str = None):
|
|
|
+ """
|
|
|
+ Method for attaching data from a hdf5 store.
|
|
|
+ The hdf5 store is a binary file,
|
|
|
+ after loading it, it is a dictionary with keys
|
|
|
+ X_train (y_train, X_val, y_val). The cv is loaded
|
|
|
+ from a pickle file. The reason to separate the data
|
|
|
+ store from the cv store, is the hdf5 is optimized to
|
|
|
+ store large dataframes (especially with simple types) and
|
|
|
+ a small list of lists like a cv-object is better
|
|
|
+ to be stored as a pickle file.
|
|
|
+ :param str data_hdf5_store_path: path to the hdf5 store
|
|
|
+ with train and validation data
|
|
|
+ :param str cv_pickle_path: path to the pickle file with
|
|
|
+ the cv data
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ assert(os.path.isfile(data_hdf5_store_path))
|
|
|
+ except AssertionError:
|
|
|
+ err = "Parameter hdf5_store_path is not a file"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
+
|
|
|
+ # load the hdf5 store
|
|
|
+ try:
|
|
|
+ store = pd.HDFStore(data_hdf5_store_path)
|
|
|
+ self._data_path = data_hdf5_store_path
|
|
|
+ except Exception as e:
|
|
|
+ err = "Could not load the hdf5 store. Exit with error: {}."\
|
|
|
+ .format(e)
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
+
|
|
|
+ data_input = {}
|
|
|
+
|
|
|
+ for key in ["/X_train", "/y_train", "/X_val", "/y_val"]:
|
|
|
+ if key not in store.keys():
|
|
|
+ data_input[key.replace("/", "")] = None
|
|
|
+ else:
|
|
|
+ data_input[key.replace("/", "")] = store[key]
|
|
|
+
|
|
|
+ if cv_pickle_path is not None:
|
|
|
+ try:
|
|
|
+ assert(os.path.isfile(cv_pickle_path))
|
|
|
+ except AssertionError:
|
|
|
+ err = "Parameter hdf5_store_path is not a file"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=NameError)
|
|
|
+
|
|
|
+ try:
|
|
|
+ data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
|
|
|
+ self._cv_path = cv_pickle_path
|
|
|
+ except Exception as e:
|
|
|
+ err = "Could not load the pickeled cv. Exit with error: {}."\
|
|
|
+ .format(e)
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
+ else:
|
|
|
+ data_input["cv"] = None
|
|
|
+
|
|
|
+ self.attach_data(**data_input)
|
|
|
+
|
|
|
+ store.close()
|
|
|
+
|
|
|
+ def configer_summary_saving(self,
|
|
|
+ save_method: Callable = None,
|
|
|
+ kwargs: dict = None):
|
|
|
+ """
|
|
|
+ Attaching a method for saving information about
|
|
|
+ the trials/space/strategy and the result of
|
|
|
+ the current best pipeline. This method can
|
|
|
+ save the result in a txt or a json file,
|
|
|
+ or in a database for example. Arguments like
|
|
|
+ file path or the table name can be specified in kwargs.
|
|
|
+ :param Callable save_method: method for saving the result
|
|
|
+ of the pipeline selection. The method must accept
|
|
|
+ a pandas DataFrame as argument. See self._save_summary
|
|
|
+ method for the format of the argument being saved.
|
|
|
+ By default, saving to a csv file.
|
|
|
+ Examples:
|
|
|
+ functools.partial(pd.DataFrame.to_csv,
|
|
|
+ **{"path_or_buf": <PATH>})
|
|
|
+ functools.partial(np.savetxt, **{"fname": <PATH>})
|
|
|
+
|
|
|
+ functools.partial(SQLHandler(<URI>).append_to_table,
|
|
|
+ **{"tablename": <NAME>})
|
|
|
+
|
|
|
+ functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
|
|
|
+ **{"collection_name": <NAME>})
|
|
|
+
|
|
|
+ using functools can be avoided by providing the kwarg argument
|
|
|
+ :param dict kwargs: a dictionary with keyword arguments
|
|
|
+ (like tablename) to provide to the save_method
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ save_method = save_method or functools.partial(
|
|
|
+ pd.DataFrame.to_excel, **{"path_or_buf": "result.csv"})
|
|
|
+
|
|
|
+ kwargs = kwargs or {}
|
|
|
+
|
|
|
+ self._save_method = functools.partial(save_method, **kwargs)
|
|
|
+
|
|
|
+ self.configured_summary_saving = True
|
|
|
+
|
|
|
+ self._logger.info("Configured summary saving")
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ err = ("Failed to configure the summary saving. "
|
|
|
+ "Exit with error {}".format(e))
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
+
|
|
|
+ def _save_summary(self, summary: dict):
|
|
|
+ """
|
|
|
+ """
|
|
|
+ try:
|
|
|
+ assert(self.configured_summary_saving)
|
|
|
+ except AssertionError:
|
|
|
+ err = "Result saving must be configured first"
|
|
|
+ self._logger.log_and_raise_error(err, ErrorType=AssertionError)
|
|
|
+
|
|
|
+ try:
|
|
|
+ self._save_method(summary)
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ err = ("Could not configure summary saving. "
|
|
|
+ "Exit with error: {}".format(e))
|
|
|
+
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
+
|
|
|
+ def _evaluate(self, pipeline: Pipeline,
|
|
|
+ scoring: Callable = None,
|
|
|
+ cross_validation: Callable = None) -> dict:
|
|
|
+ """
|
|
|
This method is called in _objective.
|
|
|
|
|
|
Calculates the cost on the attached data.
|
|
@@ -289,24 +532,46 @@ class PipelineSelector(ABC):
|
|
|
|
|
|
:param Pipeline pipeline: machine learning pipeline
|
|
|
that will be evaluated with cross-validation
|
|
|
+ :param cross_validation: a function that has the same
|
|
|
+ signature as sklearn.model_selection.cross_validate
|
|
|
|
|
|
- :output: dictionary with the aggregated
|
|
|
+ :return: dictionary with the aggregated
|
|
|
cross-validation score and
|
|
|
the score variance.
|
|
|
- '''
|
|
|
- if not self._cross_validator_attached:
|
|
|
- self._cross_validator = sklearn_cross_validator
|
|
|
+ """
|
|
|
+ try:
|
|
|
+
|
|
|
+ scoring = {"score": make_scorer(self._cost_func)}
|
|
|
+
|
|
|
+ scoring.update({metric_name: make_scorer(metric)
|
|
|
+ for metric_name, metric
|
|
|
+ in self._additional_metrics.items()})
|
|
|
+
|
|
|
+ scores = self._cross_validation(
|
|
|
+ estimator=pipeline,
|
|
|
+ X=self._X,
|
|
|
+ y=self._y,
|
|
|
+ cv=self._cv or 5,
|
|
|
+ scoring=scoring,
|
|
|
+ error_score=np.nan)
|
|
|
+
|
|
|
+ scores_average = {
|
|
|
+ metric_name.replace("test_", ""):
|
|
|
+ self._cross_val_averaging_func(scores[metric_name])
|
|
|
+ for metric_name in scores
|
|
|
+ if metric_name.startswith("test")}
|
|
|
+
|
|
|
+ scores_variance = {
|
|
|
+ metric_name.replace("test_", "") + "_variance":
|
|
|
+ np.var(scores[metric_name])
|
|
|
+ for metric_name in scores
|
|
|
+ if metric_name.startswith("test")}
|
|
|
|
|
|
- scores = self._cross_validator(
|
|
|
- estimator=pipeline,
|
|
|
- X=self._X,
|
|
|
- y=self._y,
|
|
|
- cv=self._cv or 5,
|
|
|
- scoring=make_scorer(self._cost_func),
|
|
|
- error_score=np.nan)
|
|
|
+ return {**scores_average, **scores_variance}
|
|
|
|
|
|
- return {'value': self._cross_val_averaging_func(scores['test_score']),
|
|
|
- 'variance': np.var(scores['test_score'])}
|
|
|
+ except Exception as e:
|
|
|
+ err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
|
|
def _objective(self, space_element: dict) -> dict:
|
|
|
'''
|
|
@@ -354,79 +619,137 @@ class PipelineSelector(ABC):
|
|
|
|
|
|
start_time = time.time()
|
|
|
|
|
|
- if not self._data_attached:
|
|
|
+ try:
|
|
|
+ assert(self.attached_data)
|
|
|
+ except AssertionError:
|
|
|
err = ("Data must be attached in order "
|
|
|
"in order to effectuate the best"
|
|
|
"pipeline search")
|
|
|
self._logger.log_and_raise_error(err)
|
|
|
|
|
|
- self._run_number += 1
|
|
|
+ summary = {}
|
|
|
+
|
|
|
+ if self._strategy_name is not None:
|
|
|
+ summary["strategy_name"] = self._strategy_name
|
|
|
+
|
|
|
+ if isinstance(self._cost_func, str):
|
|
|
+ summary["cost_func"] = self._cost_func
|
|
|
+
|
|
|
+ elif hasattr(self._cost_func, "__name__"):
|
|
|
+ summary["cost_func"] = self._cost_func.__name__
|
|
|
+
|
|
|
+ summary["trials_path"] = self.trials_path
|
|
|
+
|
|
|
+ if self._data_path is not None:
|
|
|
+ summary["data_path"] = self._data_path
|
|
|
+
|
|
|
+ if self._cv_path is not None:
|
|
|
+ summary["cv_path"] = self._cv_path
|
|
|
|
|
|
- pipeline = space_element['pipeline']
|
|
|
- params = space_element['params']
|
|
|
- pipeline.set_params(**params)
|
|
|
+ summary["start_tuning_time"] = self.start_tuning_time
|
|
|
|
|
|
- self._logger.info(("Run number {0}: "
|
|
|
- "Current score is {1}: "
|
|
|
- "Training pipeline {2} "
|
|
|
- "with parameters: {3}. ").format(
|
|
|
- self._run_number,
|
|
|
- self._best_score,
|
|
|
- space_element['name'],
|
|
|
- params))
|
|
|
+ summary["iteration"] = self._iteration
|
|
|
+
|
|
|
+ backup_cond = (self._backup_trials_freq is not None) and\
|
|
|
+ ((self._iteration - self._start_iteration - 1) %
|
|
|
+ self._backup_trials_freq == 0) or\
|
|
|
+ self._score_improved
|
|
|
+
|
|
|
+ if backup_cond:
|
|
|
+ self._backup_trials()
|
|
|
+ self._score_improved = False
|
|
|
|
|
|
try:
|
|
|
+ pipeline = space_element['pipeline']
|
|
|
+ params = space_element['params']
|
|
|
+ pipeline.set_params(**params)
|
|
|
+
|
|
|
+ self._logger.info(("Iteration {0}: "
|
|
|
+ "Current score is {1}: "
|
|
|
+ "Training pipeline {2} "
|
|
|
+ "with parameters: {3}. ").format(
|
|
|
+ self._iteration,
|
|
|
+ self.best_score,
|
|
|
+ space_element['name'],
|
|
|
+ params))
|
|
|
+
|
|
|
result = self._evaluate(pipeline)
|
|
|
|
|
|
- assert(not np.isnan(result["value"]))
|
|
|
+ summary.update(result)
|
|
|
|
|
|
- if self._run_number % self._backup_trials_freq == 0:
|
|
|
- self._backup_trials()
|
|
|
+ end_time = time.time()
|
|
|
|
|
|
- if (self._best_score != self._best_score) or\
|
|
|
- self._score_factor*result["value"] <\
|
|
|
- self._score_factor*self._best_score:
|
|
|
+ assert(not np.isnan(result["score"])),\
|
|
|
+ "Score value is not in the output of the _evaluate method"
|
|
|
|
|
|
- self._logger.info("Score got better, new best score is: {}"
|
|
|
- .format(result["value"]))
|
|
|
+ summary['status'] = STATUS_OK
|
|
|
+ summary.update(result)
|
|
|
+ summary['loss'] = self._score_factor * summary['score']
|
|
|
+ summary['timestamp'] = datetime.datetime.today()
|
|
|
+ summary['train_time'] = end_time - start_time
|
|
|
|
|
|
- self._best_score = result['value']
|
|
|
+ self._iteration += 1
|
|
|
|
|
|
- end_time = time.time()
|
|
|
+ self._score_improved = (self.best_score != self.best_score) or\
|
|
|
+ (self._score_factor*result["score"] <
|
|
|
+ self._score_factor*self.best_score)
|
|
|
|
|
|
- return {'loss': self._score_factor * result["value"],
|
|
|
- 'status': STATUS_OK,
|
|
|
- 'score': result["value"],
|
|
|
- 'score_variance': result["variance"],
|
|
|
- 'timestamp': datetime.datetime.today(),
|
|
|
- 'train_time': end_time - start_time}
|
|
|
+ if self._score_improved:
|
|
|
+
|
|
|
+ self._logger.info("Score improved, new best score is: {}"
|
|
|
+ .format(result["score"]))
|
|
|
+
|
|
|
+ self.best_score = result['score']
|
|
|
+
|
|
|
+ if self.configured_summary_saving:
|
|
|
+ self._save_summary(summary)
|
|
|
|
|
|
except Exception as e:
|
|
|
|
|
|
self._logger.warning("Trial failed with error {}".format(e))
|
|
|
|
|
|
- return {'loss': np.nan,
|
|
|
- 'status': STATUS_FAIL,
|
|
|
- 'score': np.nan,
|
|
|
- 'score_variance': np.nan,
|
|
|
- 'timestamp': datetime.datetime.today(),
|
|
|
- 'train_time': np.nan}
|
|
|
+ summary['status'] = STATUS_FAIL
|
|
|
+ summary['timestamp'] = datetime.datetime.today()
|
|
|
+ summary['error'] = e
|
|
|
+ for key in ['loss', 'score', 'score_variance', 'train_time']:
|
|
|
+ summary[key] = np.nan
|
|
|
+
|
|
|
+ return summary
|
|
|
|
|
|
@abstractmethod
|
|
|
def run_trials(self):
|
|
|
"""
|
|
|
+ Method that runs the hyperparameter tuning over possibly multiple
|
|
|
+ pipeline types specified in self.space
|
|
|
+ When run_trials method is finished the flag self.finished_tuning
|
|
|
+ should be set to True and the methods self._backup_trials and
|
|
|
+ optionally self._save_summary should be called.
|
|
|
"""
|
|
|
pass
|
|
|
|
|
|
@abstractproperty
|
|
|
- def best_trial(self) -> float:
|
|
|
+ def number_of_trials(self) -> int:
|
|
|
"""
|
|
|
+ Number of trials already run in the current trials object
|
|
|
+ """
|
|
|
+ pass
|
|
|
+
|
|
|
+ @abstractproperty
|
|
|
+ def best_trial(self) -> dict:
|
|
|
+ """
|
|
|
+ Best trial so far.
|
|
|
+ Should contain the best pipeline,
|
|
|
+ best hyperparameters,
|
|
|
+ as well as an output of the self._objective method,
|
|
|
+ but the exact form of the output depends on the implementation
|
|
|
+ of the Trials object.
|
|
|
"""
|
|
|
pass
|
|
|
|
|
|
@abstractproperty
|
|
|
def best_trial_score(self) -> float:
|
|
|
"""
|
|
|
+ Score of the best pipeline with the best hyperparameters
|
|
|
"""
|
|
|
pass
|
|
|
|
|
@@ -439,5 +762,34 @@ class PipelineSelector(ABC):
|
|
|
@abstractproperty
|
|
|
def best_trial_pipeline(self) -> Pipeline:
|
|
|
"""
|
|
|
+ Best pipeline with best hyperparameters
|
|
|
+ """
|
|
|
+ pass
|
|
|
+
|
|
|
+ @abstractmethod
|
|
|
+ def get_n_best_trial_pipelines(self, n: int) -> list:
|
|
|
+ """
|
|
|
+ N best pipelines with corresponding
|
|
|
+ best hyperparameters
|
|
|
+ """
|
|
|
+ pass
|
|
|
+
|
|
|
+ @abstractmethod
|
|
|
+ def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
|
|
|
+ """
|
|
|
+ If the hyperparameter search is done over multiple
|
|
|
+ pipelines, then returns n different pipeline-types
|
|
|
+ with corresponding hyperparameters
|
|
|
+ """
|
|
|
+ pass
|
|
|
+
|
|
|
+ @abstractmethod
|
|
|
+ def trials_to_excel(self, path: str):
|
|
|
+ """
|
|
|
+ Trials object in the shape of a table written to excel,
|
|
|
+ should contain the iteration, pipeline (as str),
|
|
|
+ hyperparameters (as str), self.best_result (see self._objective method)
|
|
|
+ as well as additional information configured
|
|
|
+ through self.save_result method.
|
|
|
"""
|
|
|
pass
|