
change in the abstract PipelineSelector class

tanja 4 years ago
parent
commit
28f4bcc46d
1 changed file with 495 additions and 143 deletions

+ 495 - 143
cdplib/pipeline_selector/PipelineSelector.py

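For orientation, here is a minimal usage sketch of the API this commit introduces (an editor's illustration, not part of the diff): HyperoptPipelineSelector stands in for a concrete child class that implements the abstract methods, and the data set, space module and file paths are assumptions made for the example.

import functools
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, precision_score
# hypothetical concrete child class (hyperopt- or gridsearch-based)
from cdplib.pipeline_selector import HyperoptPipelineSelector

X, y = load_breast_cancer(return_X_y=True)

selector = HyperoptPipelineSelector(
        cost_func=accuracy_score,
        greater_is_better=True,
        trials_path="trials/breast_cancer_trials.pkl",
        additional_metrics={"precision": precision_score},
        strategy_name="baseline",
        stdout_log_level="INFO")

# the space is a list of dicts with keys: name, pipeline, params
selector.attach_space_from_module(module_path="spaces/example_space.py",
                                  name="space")
selector.attach_data(X_train=X, y_train=y, cv=5)

# save the current best summary whenever the score improves
# (configer_summary_saving is the method name as defined in this commit)
selector.configer_summary_saving(
        save_method=functools.partial(pd.DataFrame.to_csv,
                                      **{"path_or_buf": "best_result.csv"}))

selector.run_trials()

print(selector.best_trial_score)
print(selector.best_trial_pipeline)
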
@@ -5,11 +5,15 @@ Created on Wed Sep 30 14:23:23 2020
 
 @author: tanya
 @description: an abstract class for selecting a machine learning
- pipeline in a space of parameter distributions over multiple pipelines.
- The selection is though in such a way that a Trials object is being
+ pipeline from a space (deterministic or random) of parameter distributions
+ over multiple pipelines.
+ The selection is thought in such a way that a Trials object is being
 maintained during the tuning process from which one can retrieve
- the best pipeline so far as well as the entire tuning history
- if needed.
+ the best pipeline so far
+ as well as the entire tuning history if needed.
+ Methods configure_cross_validation and configure_result_saving
+ allow to use a custom cross-validation method and
+ save the current best result in a file or database during training.
 Children classes: hyperopt and custom gridsearch.
 """
 
@@ -18,12 +22,13 @@ import os
 import sys
 import time
 import datetime
-from typing import Callable
 import numpy as np
 import pandas as pd
 from abc import ABC, abstractmethod, abstractproperty
+from typing import Callable
+import functools
 from sklearn.pipeline import Pipeline
-from sklearn.model_selection import cross_validate as sklearn_cross_validator
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
 from sklearn.metrics import make_scorer
 from hyperopt import STATUS_OK, STATUS_FAIL
 from cdplib.log import Log
@@ -36,20 +41,28 @@ sys.path.append(os.getcwd())
 class PipelineSelector(ABC):
     """
     An abstract class for selecting a machine learning
-    pipeline in a space of parameter distributions over multiple pipelines.
+    pipeline from a space (deterministic or random) of parameter
+    distributions over multiple pipelines.
     The selection is though in such a way that a Trials object is being
     maintained during the tuning process from which one can retrieve
     the best pipeline so far as well as the entire tuning history
     if needed.
+    Methods configure_cross_validation and configure_result_saving
+    allow to use a custom cross-validation method and
+    save the current best result in a file or database during training.
+    Children classes: hyperopt and custom gridsearch.
     """
     """
     def __init__(self,
     def __init__(self,
-                 cost_func,
+                 cost_func: (Callable, str),
                  greater_is_better: bool,
                  trials_path: str,
-                 backup_trials_freq: int = 1,
-                 cross_val_averaging_func: callable = None):
-        '''
-        :param callable cost_func: function to minimize or maximize
+                 backup_trials_freq: int = None,
+                 cross_val_averaging_func: Callable = None,
+                 additional_metrics: dict = None,
+                 strategy_name: str = None,
+                 stdout_log_level: str = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
 
         :param bool greater_is_better: when True
             cost_func is maximized, else minimized.
@@ -64,67 +77,98 @@ class PipelineSelector(ABC):
 
         :param backup_trials_freq: frequecy in interations (trials)
             of saving the trials object at the trials_path.
+            if None, the trials object is backed up every time
+            the score improves.
 
         :param str log_path: Optional, when not provided logs to stdout.
 
-        :param callable cross_val_averaging_func: optional,
+        :param Callable cross_val_averaging_func: optional,
             when not provided set to mean. Function
             to aggregate the cross-validated values of the cost function.
             Classic situation is to take the mean,
             another example is, for example mean() - c*var().
-        '''
-        self._logger = Log("PipelineSelector: ")
+
+        :param additional_metrics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name: a name might be assigned to the trials,
+            a strategy is defined by the data set, cv object, cost function.
+            When the strategy changes, one should start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
 
         input_errors = [(cost_func, Callable,
-                         "Parameter 'cost_func' must be a callable"),
+                         "Parameter 'cost_func' must be a Callable"),
                        (greater_is_better, bool,
                         "Parameter 'greater_is_better' must be bool type"),
                        (trials_path, str,
                         "Parameter 'trials_path' must be of string type"),
                        (cross_val_averaging_func, (Callable, None.__class__),
                         ("Parameter 'cross_val_averaging_func'"
-                          "must be a callable")),
-                        (backup_trials_freq, int,
-                         "Parameter backup_trials_freq must be an int")]
+                          "must be a Callable")),
+                        (backup_trials_freq, (int, None.__class__),
+                         "Parameter backup_trials_freq must be an int"),
+                        (additional_metrics, (dict, None.__class__),
+                         "Parameter additional_metrics must be a dict"),
+                        (strategy_name, (str, None.__class__),
+                         "Parameter strategy_name must be a str"),
+                        (stdout_log_level, str,
+                         "Parameter stdout_log_level must be a str")]
 
         for p, t, err in input_errors:
             try:
                 assert(isinstance(p, t))
             except AssertionError:
-                self._logger.log_and_raise_error(err)
+                self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        try:
+            assert((additional_metrics is None) or
+                   all([isinstance(metric, Callable)
+                        for metric in additional_metrics.values()]))
+        except AssertionError:
+            err = "Metrics in additional_metrics must be Callables"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
 
         ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
 
         self._cost_func = cost_func
-        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        # score factor is 1 when cost_func is minimized,
+        # -1 when cost func is maximized
         self._score_factor = (not greater_is_better) - greater_is_better
-        self._trials_path = trials_path
+        self.trials_path = trials_path
         self._backup_trials_freq = backup_trials_freq
         self._cross_val_averaging_func = cross_val_averaging_func or np.mean
-        # keeping track of the current search iteration
-        self._run_number = 0
-        # space and data need to be attached to perform search.
-        self._space_attached = False
-        self._data_attached = False
-        self._cross_validator_attached = False
-        # _best_score is the same as best_trial_score property
-        # but is defined in order not to go through all the trials
-        # at each iteration.
-        self._best_score = np.nan
+        self._additional_metrics = additional_metrics or {}
+        self._strategy_name = strategy_name
+        self._data_path = None
+        self._cv_path = None
+
+        # best_score can also be read from trials
+        # but is kept explicitly in order not to
+        # search through the trials object every time
+        # loss is the opposite of score
+        self.best_score = np.nan
+
+        self._cross_validation = sklearn_cross_validation
 
         # if a trials object already exists at the given path,
         # it is loaded and the search is continued. Else,
         # the search is started from the beginning.
-        if os.path.isfile(trials_path):
+        if os.path.isfile(self.trials_path):
             try:
-                with open(trials_path, "rb") as f:
+                with open(self.trials_path, "rb") as f:
                     self._trials = pickle.load(f)
 
-                self._best_score = self.best_trial_score
+                self._start_iteration = self.number_of_trials
+
+                self.best_score = self.best_trial_score
 
                 self._logger.info(("Loaded an existing trials object"
                                    "Consisting of {} trials")
-                                  .format(len(self._trials.trials)))
+                                  .format(self._start_iteration))
 
             except Exception as e:
                 err = ("Trials object could not be loaded. "
@@ -137,75 +181,142 @@ class PipelineSelector(ABC):
                                   "Starting from scratch."))
                                   "Starting from scratch."))
 
 
             self._trials = None
             self._trials = None
+            self._start_iteration = 0
+
+        self.attached_space = False
+        self.attached_data = False
+        self.configured_cross_validation = False
+        self.configured_summary_saving = False
+
+        # keeping track of the current search iteration
+        self._iteration = self._start_iteration
+        self._score_improved = False
+
+        self.start_tuning_time = datetime.datetime.today()
+        self.end_tuning_time = None
+        self.finished_tuning = False
 
     def _backup_trials(self):
         '''
         Pickles (Saves) the trials object.
         Used in a scheduler.
         '''
-        with open(self._trials_path, "wb") as f:
-            pickle.dump(self._trials, f)
+        try:
+            with open(self.trials_path, "wb") as f:
+                pickle.dump(self._trials, f)
+        except Exception as e:
+            err = "Could not backup trials. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
 
-    def attach_cross_validator(self, cross_validator: Callable = None,
-                               module_path: str = None,
-                               name: str = None):
+    def configure_cross_validation(self,
+                                   cross_validation: Callable,
+                                   kwargs: dict = None):
         """
         """
         Method for attaching a custom cross-validation function
         Method for attaching a custom cross-validation function
-        :param cross_validator: a function that has the same
+        :param cross_validation: a function that has the same
              signature as sklearn.model_selection.cross_validate
+        """
+        try:
+            assert(isinstance(cross_validation, Callable))
+        except AssertionError:
+            err = "Parameter cross_validation must be a function"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        try:
+            kwargs = kwargs or {}
+            assert(isinstance(kwargs, dict))
+        except AssertionError:
+            err = "Paramter kwargs must be a dict"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        try:
+            self._cross_validation = functools.partial(
+                    cross_validation, **kwargs)
+
+            self.configured_cross_validation = True
+
+            if hasattr(cross_validation, "__name__"):
+                self.best_result["cross_validation"] =\
+                    cross_validation.__name__
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to configure cross-validation. "
+                   "Exit with error: {}".format(e))
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation_from_module(self,
+                                               module_path: str,
+                                               name: str):
+        """
         :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
+            where the cross_validation function is defined.
 
-        :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
+        :param str name: name of the cross validation function
+            loaded from a python module.
         """
         """
         try:
         try:
-            assert((cross_validator is not None) or
-                   ((module_path is not None) and (name is not None)))
+            assert(isinstance(module_path, str) and
+                   isinstance(name, str))
         except AssertionError:
-            err = ("Either cross_validator or "
-                   "(module_path, name) must be provided")
-            self._logger.log_and_raise_error(err)
+            err = "Parameters module_path and name must be of str type"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
 
-        self._cross_validator = cross_validator or\
-            LoadingUtils().load_from_module(module_path=module_path, name=name)
+        try:
+            self._cross_validation = \
+                LoadingUtils().load_from_module(
+                        module_path=module_path, name=name)
 
-        self._logger.info("Attached a cross validator")
-        self._cross_validator_attached = True
+            self.configured_cross_validation = True
 
-    def attach_space(self, space=None,
-                     module_path: str = None,
-                     name: str = None):
-        '''
+            self.best_result["cross_validation"] = name
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to load cross-validation from module. "
+                   "Exit with error: {}".format(e))
+            self._logger.log_and_raise_error(err)
+
+    def attach_space(self, space):
+        """
         :param space: space where
-            the search is performed. Optional when a space
-            is loaded from a python module. A space might be either
+            the search is performed. A space might be either
             a list of dictionaries or a hyperopt space object
             the elements of which are dictionaries with keys:
             name, pipeline, params
+        """
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self.attached_space = True
 
+    def attach_space_from_module(self, module_path: str, name: str):
+        """
         :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
+            where the space is defined.
 
         :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
-        '''
+            a python module.
+        """
         try:
-            assert((space is not None) or
-                   ((module_path is not None) and (name is not None)))
+            assert(isinstance(module_path, str) and
+                   isinstance(name, str))
         except AssertionError:
-            err = "Either space or (module_path, name) must be provided"
-            self._logger.log_and_raise_error(err)
+            err = "Parameters module_path and name must be of str type"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
 
-        self._space = space or LoadingUtils().load_from_module(
-                module_path=module_path, name=name)
+        try:
+            self._space = LoadingUtils().load_from_module(
+                    module_path=module_path, name=name)
 
-        self._logger.info("Attached parameter distribution space")
-        self._space_attached = True
+            self._logger.info("Attached parameter distribution space")
+
+            self.attached_space = True
+        except Exception as e:
+            err = ("Failed to attach space from module. "
+                   "Exit with error {}".format(e))
+            self._logger.log_and_raise_error(err)
 
     def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
                     y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
@@ -242,25 +353,28 @@ class PipelineSelector(ABC):
                        isinstance(y_train, (pd.Series, np.ndarray,
                                             pd.DataFrame, NoneType)) and
                        isinstance(y_val, (pd.Series, np.ndarray)) and
-                       ((y_val is None) if (y_train is None)
-                        else (y_val is not None)))
+                       (y_val is None) == (y_train is None))
             except AssertionError:
                 self._logger.log_and_raise_error(input_err)
 
-            # cost is evaluated with a cross validation function
-            # that accepts an array and a cv object with
-            # indices of the fold splits.
-            # Here we create a trivial cv object
-            # with one validation split.
-
-            train_inds = list(range(len(X_train)))
-            val_inds = list(range(len(X_train),
-                                  len(X_train) + len(X_val)))
-
-            self._cv = [(train_inds, val_inds)]
-            self._X = np.concatenate([X_train, X_val])
-            self._y = None if y_train is None\
-                else np.concatenate([y_train, y_val])
+            try:
+                # cost is evaluated with a cross validation function
+                # that accepts an array and a cv object with
+                # indices of the fold splits.
+                # Here we create a trivial cv object
+                # with one validation split.
+
+                train_inds = list(range(len(X_train)))
+                val_inds = list(range(len(X_train),
+                                      len(X_train) + len(X_val)))
+
+                self._cv = [(train_inds, val_inds)]
+                self._X = np.concatenate([X_train, X_val])
+                self._y = None if y_train is None\
+                    else np.concatenate([y_train, y_val])
+            except Exception as e:
+                err = "Failed to attach data. Exit with error: {}".format(e)
+                self._logger.log_and_raise_error(err)
 
         else:
             try:
@@ -276,10 +390,139 @@ class PipelineSelector(ABC):
             self._y = y_train
 
         self._logger.info("Attached data")
-        self._data_attached = True
+        self.attached_data = True
 
-    def _evaluate(self, pipeline: Pipeline) -> dict:
-        '''
+    def attach_data_from_hdf5(self,
+                              data_hdf5_store_path: str,
+                              cv_pickle_path: str = None):
+        """
+        Method for attaching data from a hdf5 store.
+             The hdf5 store is a binary file,
+             after loading it, it is a dictionary with keys
+             X_train (y_train, X_val, y_val). The cv is loaded
+             from a pickle file. The reason to separate the data
+             store from the cv store is that hdf5 is optimized to
+             store large dataframes (especially with simple types) and
+             a small list of lists like a cv-object is better
+             to be stored as a pickle file.
+        :param str data_hdf5_store_path: path to the hdf5 store
+            with train and validation data
+        :param str cv_pickle_path: path to the pickle file with
+            the cv data
+        """
+        try:
+            assert(os.path.isfile(data_hdf5_store_path))
+        except AssertionError:
+            err = "Parameter hdf5_store_path is not a file"
+            self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+        # load the hdf5 store
+        try:
+            store = pd.HDFStore(data_hdf5_store_path)
+            self._data_path = data_hdf5_store_path
+        except Exception as e:
+            err = "Could not load the hdf5 store. Exit with error: {}."\
+                .format(e)
+            self._logger.log_and_raise_error(err)
+
+        data_input = {}
+
+        for key in ["/X_train", "/y_train", "/X_val", "/y_val"]:
+            if key not in store.keys():
+                data_input[key.replace("/", "")] = None
+            else:
+                data_input[key.replace("/", "")] = store[key]
+
+        if cv_pickle_path is not None:
+            try:
+                assert(os.path.isfile(cv_pickle_path))
+            except AssertionError:
+                err = "Parameter hdf5_store_path is not a file"
+                self._logger.log_and_raise_error(err, ErrorType=NameError)
+
+            try:
+                data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
+                self._cv_path = cv_pickle_path
+            except Exception as e:
+                err = "Could not load the pickeled cv. Exit with error: {}."\
+                    .format(e)
+                self._logger.log_and_raise_error(err)
+        else:
+            data_input["cv"] = None
+
+        self.attach_data(**data_input)
+
+        store.close()
+
+    def configer_summary_saving(self,
+                                save_method: Callable = None,
+                                kwargs: dict = None):
+        """
+        Attaching a method for saving information about
+             the trials/space/strategy and the result of
+             the current best pipeline. This method can
+             save the result in a txt or a json file,
+             or in a database for example. Arguments like
+             file path or the table name can be specified in kwargs.
+        :param Callable save_method: method for saving the result
+            of the pipeline selection. The method must accept
+            a pandas DataFrame as argument. See self._save_result
+            method for the format of the argument being saved.
+            By default, saving to a csv file.
+            Examples:
+                functools.partial(pd.DataFrame.to_csv,
+                                  **{"path_or_buf": <PATH>})
+                functools.partial(np.savetxt, **{"fname": <PATH>})
+
+                functools.partial(SQLHandler(<URI>).append_to_table,
+                                  **{"tablename": <NAME>})
+
+                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
+                                  **{"collection_name": <NAME>})
+
+            using functools can be avoided by providing the kwarg argument
+        :param dict kwargs: a dictionary with keyword arguments
+            (like tablename) to provide to the save_method
+        """
+        try:
+            save_method = save_method or functools.partial(
+                    pd.DataFrame.to_csv, **{"path_or_buf": "result.csv"})
+
+            kwargs = kwargs or {}
+
+            self._save_method = functools.partial(save_method, **kwargs)
+
+            self.configured_summary_saving = True
+
+            self._logger.info("Configured summary saving")
+
+        except Exception as e:
+            err = ("Failed to configure the summary saving. "
+                   "Exit with error {}".format(e))
+            self._logger.log_and_raise_error(err)
+
+    def _save_summary(self, summary: dict):
+        """
+        """
+        try:
+            assert(self.configured_summary_saving)
+        except AssertionError:
+            err = "Result saving must be configured first"
+            self._logger.log_and_raise_error(err, ErrorType=AssertionError)
+
+        try:
+            self._save_method(summary)
+
+        except Exception as e:
+            err = ("Could not configure summary saving. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _evaluate(self, pipeline: Pipeline,
+                  scoring: Callable = None,
+                  cross_validation: Callable = None) -> dict:
+        """
         This method is called in _objective.
 
         Calculates the cost on the attached data.
@@ -289,24 +532,46 @@ class PipelineSelector(ABC):
 
         :param Pipeline pipeline: machine learning pipeline
             that will be evaluated with cross-validation
+        :param cross_validation: a function that has the same
+             signature as sklearn.model_selection.cross_validate
 
-        :output: dictionary with the aggregated
+        :return: dictionary with the aggregated
             cross-validation score and
             the score variance.
-        '''
-        if not self._cross_validator_attached:
-            self._cross_validator = sklearn_cross_validator
+        """
+        try:
+
+            scoring = {"score": make_scorer(self._cost_func)}
+
+            scoring.update({metric_name: make_scorer(metric)
+                            for metric_name, metric
+                            in self._additional_metrics.items()})
+
+            scores = self._cross_validation(
+                    estimator=pipeline,
+                    X=self._X,
+                    y=self._y,
+                    cv=self._cv or 5,
+                    scoring=scoring,
+                    error_score=np.nan)
+
+            scores_average = {
+                    metric_name.replace("test_", ""):
+                    self._cross_val_averaging_func(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            scores_variance = {
+                    metric_name.replace("test_", "") + "_variance":
+                    np.var(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
 
 
-        scores = self._cross_validator(
-                estimator=pipeline,
-                X=self._X,
-                y=self._y,
-                cv=self._cv or 5,
-                scoring=make_scorer(self._cost_func),
-                error_score=np.nan)
+            return {**scores_average, **scores_variance}
 
-        return {'value': self._cross_val_averaging_func(scores['test_score']),
-                'variance': np.var(scores['test_score'])}
+        except Exception as e:
+            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
 
     def _objective(self, space_element: dict) -> dict:
         '''
@@ -354,79 +619,137 @@ class PipelineSelector(ABC):
 
         start_time = time.time()
 
-        if not self._data_attached:
+        try:
+            assert(self.attached_data)
+        except AssertionError:
             err = ("Data must be attached in order "
             err = ("Data must be attached in order "
                    "in order to effectuate the best"
                    "in order to effectuate the best"
                    "pipeline search")
                    "pipeline search")
             self._logger.log_and_raise_error(err)
             self._logger.log_and_raise_error(err)
 
-        self._run_number += 1
+        summary = {}
+
+        if self._strategy_name is not None:
+            summary["strategy_name"] = self._strategy_name
+
+        if isinstance(self._cost_func, str):
+            summary["cost_func"] = self._cost_func
+
+        elif hasattr(self._cost_func, "__name__"):
+            summary["cost_func"] = self._cost_func.__name__
+
+        summary["trials_path"] = self.trials_path
+
+        if self._data_path is not None:
+            summary["data_path"] = self._data_path
+
+        if self._cv_path is not None:
+            summary["cv_path"] = self._cv_path
 
-        pipeline = space_element['pipeline']
-        params = space_element['params']
-        pipeline.set_params(**params)
+        summary["start_tuning_time"] = self.start_tuning_time
 
-        self._logger.info(("Run number {0}: "
-                           "Current score is {1}: "
-                           "Training pipeline {2} "
-                           "with parameters: {3}. ").format(
-                             self._run_number,
-                             self._best_score,
-                             space_element['name'],
-                             params))
+        summary["iteration"] = self._iteration
+
+        backup_cond = (self._backup_trials_freq is not None) and\
+            ((self._iteration - self._start_iteration - 1) %
+             self._backup_trials_freq == 0) or\
+            self._score_improved
+
+        if backup_cond:
+            self._backup_trials()
+            self._score_improved = False
 
         try:
+            pipeline = space_element['pipeline']
+            params = space_element['params']
+            pipeline.set_params(**params)
+
+            self._logger.info(("Iteration {0}: "
+                               "Current score is {1}: "
+                               "Training pipeline {2} "
+                               "with parameters: {3}. ").format(
+                                  self._iteration,
+                                  self.best_score,
+                                  space_element['name'],
+                                  params))
+
             result = self._evaluate(pipeline)
 
-            assert(not np.isnan(result["value"]))
+            summary.update(result)
 
-            if self._run_number % self._backup_trials_freq == 0:
-                self._backup_trials()
+            end_time = time.time()
 
-            if (self._best_score != self._best_score) or\
-                self._score_factor*result["value"] <\
-                    self._score_factor*self._best_score:
+            assert(not np.isnan(result["score"])),\
+                "Score value is not in the output of the _evaluate method"
 
-                self._logger.info("Score got better, new best score is: {}"
-                                  .format(result["value"]))
+            summary['status'] = STATUS_OK
+            summary.update(result)
+            summary['loss'] = self._score_factor * summary['score']
+            summary['timestamp'] = datetime.datetime.today()
+            summary['train_time'] = end_time - start_time
 
-                self._best_score = result['value']
+            self._iteration += 1
 
-            end_time = time.time()
+            self._score_improved = (self.best_score != self.best_score) or\
+                                   (self._score_factor*result["score"] <
+                                    self._score_factor*self.best_score)
 
-            return {'loss': self._score_factor * result["value"],
-                    'status': STATUS_OK,
-                    'score': result["value"],
-                    'score_variance': result["variance"],
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': end_time - start_time}
+            if self._score_improved:
+
+                self._logger.info("Score improved, new best score is: {}"
+                                  .format(result["score"]))
+
+                self.best_score = result['score']
+
+                if self.configured_summary_saving:
+                    self._save_summary(summary)
 
         except Exception as e:
 
             self._logger.warning("Trial failed with error {}".format(e))
 
-            return {'loss': np.nan,
-                    'status': STATUS_FAIL,
-                    'score': np.nan,
-                    'score_variance': np.nan,
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': np.nan}
+            summary['status'] = STATUS_FAIL
+            summary['timestamp'] = datetime.datetime.today()
+            summary['error'] = e
+            for key in ['loss', 'score', 'score_variance', 'train_time']:
+                summary[key] = np.nan
+
+        return summary
 
     @abstractmethod
     def run_trials(self):
         """
+        Method that runs the hyperparameter tuning over possibly multiple
+        pipeline types specified in self.space
+        When run_trials method is finished the flag self.finished_tuning
+        should be set to True and the methods self._backup_trials and
+        optionally self._save_result should be called.
         """
         """
         pass
         pass
 
 
     @abstractproperty
     @abstractproperty
-    def best_trial(self) -> float:
+    def number_of_trials(self) -> int:
         """
         """
+        Number of trials already run in the current trials object
+        """
+        pass
+
+    @abstractproperty
+    def best_trial(self) -> dict:
+        """
+        Best trial so far.
+         Should contain the best pipeline,
+         best hyperparameters,
+         as well as an output of the self._objective method,
+         but the exact form of the output depends on the implementation
+         of the Trials object.
         """
         """
         pass
         pass
 
 
     @abstractproperty
     @abstractproperty
     def best_trial_score(self) -> float:
     def best_trial_score(self) -> float:
         """
         """
+        Score of the best pipeline with the best hyperparameters
         """
         """
         pass
         pass
 
 
@@ -439,5 +762,34 @@ class PipelineSelector(ABC):
     @abstractproperty
     def best_trial_pipeline(self) -> Pipeline:
         """
+        Best pipeline with best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines(self, n: int) -> list:
+        """
+        N best pipelines with corresponding
+        best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
+        """
+        If the hyperparameter search is done over multiple
+        pipelines, then returns n different pipeline-types
+        with corresponding hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def trials_to_excel(self, path: str):
+        """
+        Trials object in the shape of table written to excel,
+        should contain the iteration, pipeline (as str),
+        hyperparameters (as str), self.best_result (see self._objective method)
+        as well as additional information configured
+        through self.save_result method.
         """
         """
         pass
         pass
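
For reference, the abstract methods and properties visible in this diff imply a child-class skeleton along the following lines (an editor's sketch only; the real children are the hyperopt and custom gridsearch selectors mentioned in the class docstring, and the class name below is assumed):

from sklearn.pipeline import Pipeline

from cdplib.pipeline_selector.PipelineSelector import PipelineSelector


class GridSearchPipelineSelector(PipelineSelector):
    """
    Hypothetical child class: exhaustive search over a space given
    as a list of dicts with keys: name, pipeline, params.
    """

    def run_trials(self):
        # loop over the attached space, call self._objective on each
        # space element, keep self._trials and self.best_score updated,
        # back up the trials and set self.finished_tuning = True at the end
        ...

    @property
    def number_of_trials(self) -> int:
        ...

    @property
    def best_trial(self) -> dict:
        ...

    @property
    def best_trial_score(self) -> float:
        ...

    @property
    def best_trial_pipeline(self) -> Pipeline:
        ...

    def get_n_best_trial_pipelines(self, n: int) -> list:
        ...

    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        ...

    def trials_to_excel(self, path: str):
        ...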