tanja
/
cdplib


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796
							#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 14:23:23 2020

@author: tanya
@description: an abstract class for selecting a machine learning
 pipeline from a space (deterministic or random) of parameter distributions
 over multiple pipelines.
 The selection is thought in such a way that a Trials object is being
 maintained during the tuning process from which one can retrieve
 the best pipeline so far
 as well as the entire tuning history if needed.
 Methods configure_cross_validation and configure_result_saving
 allow to use a custom cross-validation method and
 save the current best result in a file or database during training.
 Children classes: hyperopt and custom gridsearch.
"""

import pickle
import os
import sys
import time
import datetime
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod, abstractproperty
from typing import Callable
import functools
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate as sklearn_cross_validation
from sklearn.metrics import make_scorer
from hyperopt import STATUS_OK, STATUS_FAIL
from cdplib.log import Log
from cdplib.utils import ExceptionsHandler
from cdplib.utils import LoadingUtils

sys.path.append(os.getcwd())


class PipelineSelector(ABC):
    """
    An abstract class for selecting a machine learning
    pipeline from a space (deterministic or random) of parameter
    distributions over multiple pipelines.
    The selection is though in such a way that a Trials object is being
    maintained during the tuning process from which one can retrieve
    the best pipeline so far as well as the entire tuning history
    if needed.
    Methods configure_cross_validation and configure_result_saving
    allow to use a custom cross-validation method and
    save the current best result in a file or database during training.
    Children classes: hyperopt and custom gridsearch.
    """
    def __init__(self,
                 cost_func: (Callable, str),
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = None,
                 cross_val_averaging_func: Callable = None,
                 additional_metrics: dict = None,
                 strategy_name: str = None,
                 stdout_log_level: str = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize

        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.

        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and pipelines, and parameters tried out so far. If a trials object
            already exists at the given path, it is loaded and the
            search is continued, else, the search is started from
            the beginning.

        :param backup_trials_freq: frequecy in interations (trials)
            of saving the trials object at the trials_path.
            if None, the trials object is backed up avery time
            the score improves.

        :param str log_path: Optional, when not provided logs to stdout.

        :param Callable cross_val_averaging_func: optional,
            when not provided set to mean. Function
            to aggregate the cross-validated values of the cost function.
            Classic situation is to take the mean,
            another example is, for example mean() - c*var().

        :param additional_metics: dict of additional metrics to save
            of the form {"metric_name": metric} where metric is a Callable.

        :param str strategy_name: a name might be asigned to the trials,
            a strategy is defined by the data set, cv object, cost function.
            When the strategy changes, one should start with new trials.

        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        self._logger = Log("PipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        input_errors = [(cost_func, Callable,
                         "Parameter 'cost_func' must be a Callable"),
                        (greater_is_better, bool,
                         "Parameter 'greater_is_better' must be bool type"),
                        (trials_path, str,
                         "Parameter 'trials_path' must be of string type"),
                        (cross_val_averaging_func, (Callable, None.__class__),
                         ("Parameter 'cross_val_averaging_func'"
                          "must be a Callable")),
                        (backup_trials_freq, (int, None.__class__),
                         "Parameter backup_trials_freq must be an int"),
                        (additional_metrics, (dict, None.__class__),
                         "Parameter additional_metrics must be a dict"),
                        (strategy_name, (str, None.__class__),
                         "Parameter strategy_name must be a str"),
                        (stdout_log_level, str,
                         "Parameter stdout_log_level must be a str")]

        for p, t, err in input_errors:
            try:
                assert(isinstance(p, t))
            except AssertionError:
                self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            assert((additional_metrics is None) or
                   all([isinstance(metric, Callable)
                        for metric in additional_metrics.values()]))
        except AssertionError:
            err = "Metrics in additional_metrics must be Callables"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)

        self._cost_func = cost_func
        # score factor is 1 when cost_func is minimized,
        # -1 when cost func is maximized
        self._score_factor = (not greater_is_better) - greater_is_better
        self.trials_path = trials_path
        self._backup_trials_freq = backup_trials_freq
        self._cross_val_averaging_func = cross_val_averaging_func or np.mean
        self._additional_metrics = additional_metrics or {}
        self._strategy_name = strategy_name
        self._data_path = None
        self._cv_path = None

        # best_score can be also read from trials
        # but is kept explicitely in order not to
        # search through the trials object every time
        # loss is the opposite of score
        self.best_score = np.nan

        self._cross_validation = sklearn_cross_validation

        # if a trials object already exists at the given path,
        # it is loaded and the search is continued. Else,
        # the search is started from the beginning.
        if os.path.isfile(self.trials_path):
            try:
                with open(self.trials_path, "rb") as f:
                    self._trials = pickle.load(f)

                self._start_iteration = self.number_of_trials

                self.best_score = self.best_trial_score

                self._logger.info(("Loaded an existing trials object"
                                   "Consisting of {} trials")
                                  .format(self._start_iteration))

            except Exception as e:
                err = ("Trials object could not be loaded. "
                       "Exit with error {}").format(e)
                self._logger.log_and_raise_error(err)
                self._trials = None

        else:
            self._logger.warning(("No existing trials object was found, "
                                  "Starting from scratch."))

            self._trials = None
            self._start_iteration = 0

        self.attached_space = False
        self.attached_data = False
        self.configured_cross_validation = False
        self.configured_summary_saving = False

        # keeping track of the current search iteration
        self._iteration = self._start_iteration
        self._score_improved = False

        self.start_tuning_time = datetime.datetime.today()
        self.end_tuning_time = None
        self.finished_tuning = False

    def _backup_trials(self):
        '''
        Pickles (Saves) the trials object.
        Used in a scheduler.
        '''
        try:
            with open(self.trials_path, "wb") as f:
                pickle.dump(self._trials, f)
        except Exception as e:
            err = "Could not backup trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def configure_cross_validation(self,
                                   cross_validation: Callable,
                                   kwargs: dict = None):
        """
        Method for attaching a custom cross-validation function
        :param cross_validation: a function that has the same
             signature as sklearn.model_selection.cross_validate
        """
        try:
            assert(isinstance(cross_validation, Callable))
        except AssertionError:
            err = "Parameter cross_validation must be a function"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            kwargs = kwargs or {}
            assert(isinstance(kwargs, dict))
        except AssertionError:
            err = "Paramter kwargs must be a dict"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            self._cross_validation = functools.partial(
                    self._cross_validation, **kwargs)

            self.configured_cross_validation = True

            if hasattr(cross_validation, "__name__"):
                self.best_result["cross_validation"] =\
                    cross_validation.__name__

            self._logger.info("Configured cross validation")

        except Exception as e:
            err = ("Failed to configure cross-validation. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def configure_cross_validation_from_module(self,
                                               module_path: str,
                                               name: str):
        """
        :param str module_path: path to python module
            where the cross_validation function is defined.

        :param str name: name of the cross validation function
            loaded froma python module.
        """
        try:
            assert(isinstance(module_path, str) and
                   isinstance(name, str))
        except AssertionError:
            err = "Parameters module_path and name must be of str type"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            self._cross_validation = \
                LoadingUtils().load_from_module(
                        module_path=module_path, name=name)

            self.configured_cross_validation = True

            self.best_result["cross_validation"] = name

            self._logger.info("Configured cross validation")

        except Exception as e:
            err = ("Failed to load cross-validation from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(e)

    def attach_space(self, space):
        """
        :param space: space where
            the search is performed. A space might be either
            a list of dictionaries or a hyperopt space object
            the elements of which are dictionaries with keys:
            name, pipeline, params
        """
        self._space = space
        self._logger.info("Attached parameter distribution space")
        self.attached_space = True

    def attach_space_from_module(self, module_path: str, name: str):
        """
        :param str module_path: path to python module
            where the space is defined.

        :param str name: name of the space loaded from
            a python module.
        """
        try:
            assert(isinstance(module_path, str) and
                   isinstance(name, str))
        except AssertionError:
            err = "Parameters module_path and name must be of str type"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            self._space = LoadingUtils().load_from_module(
                    module_path=module_path, name=name)

            self._logger.info("Attached parameter distribution space")

            self.attached_space = True
        except Exception as e:
            err = ("Failed to attach space from module. "
                   "Exit with error {}".format(e))
            self._logger.loger_and_raise_error(err)

    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
                    X_val: (pd.DataFrame, np.ndarray) = None,
                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
                    cv: (list, int) = None):
        '''
        :param array X_train: data on which
            machine learning pipelines are trained

        :param array y_train: optional, vector with targets,
            (not all algorithms require a targets)

        :param array X_val: optional, validation data.
            When not provided, cross-validated value
            of the cost_func is calculated.

        :param array y_val: optional, validation targets

        :param list cv: list of tuples containing
            train and validation indices or an integer representing
            the number of folds for a random split of data
            during cross-validation
            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
        '''
        NoneType = None.__class__

        input_err = "Non-valid combination of train and val data types"

        if cv is None:
            try:
                assert(isinstance(X_train, (pd.DataFrame, np.ndarray)) and
                       isinstance(X_val, (pd.DataFrame, np.ndarray)) and
                       isinstance(y_train, (pd.Series, np.ndarray,
                                            pd.DataFrame, NoneType)) and
                       isinstance(y_val, (pd.Series, np.ndarray)) and
                       (y_val is None) == (y_train is None))
            except AssertionError:
                self._logger.log_and_raise_error(input_err)

            try:
                # cost is evaluated with a cross validation function
                # that accepts an array and a cv object with
                # indices of the fold splits.
                # Here we create a trivial cv object
                # with one validation split.

                train_inds = list(range(len(X_train)))
                val_inds = list(range(len(X_train),
                                      len(X_train) + len(X_val)))

                self._cv = [(train_inds, val_inds)]
                self._X = np.concatenate([X_train, X_val])
                self._y = None if y_train is None\
                    else np.concatenate([y_train, y_val])
            except Exception as e:
                err = "Failed to attach data. Exit with error: {}".format(e)
                self._logger.log_and_raise_error(err)

        else:
            try:
                assert(isinstance(X_train, (pd.DataFrame, np.ndarray)) and
                       isinstance(y_train, (pd.Series, np.ndarray,
                                            pd.DataFrame, NoneType)) and
                       (X_val is None) and (y_val is None))
            except AssertionError:
                self._logger.log_and_raise_error(input_err)

            self._cv = cv
            self._X = X_train
            self._y = y_train

        self._logger.info("Attached data")
        self.attached_data = True

    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None):
        """
        Method for attaching data from a hdf5 store.
             The hdf5 store is a binary file,
             after loading it, it is a dictionary with keys
             X_train (y_train, X_val, y_val). The cv is loaded
             from a pickle file. The reason to separate the data
             store from the cv store, is the hdf5 is optimized to
             store large dataframes (especially with simple types) and
             a a small list of lists like a cv-object is better
             to be stored as a pickle file.
        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data
        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path))
        except AssertionError:
            err = "Parameter hdf5_store_path is not a file"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        # load the hdf5 store
        try:
            store = pd.HDFStore(data_hdf5_store_path)
            self._data_path = data_hdf5_store_path
        except Exception as e:
            err = "Could not load the hdf5 store. Exit with error: {}."\
                .format(e)
            self._logger.log_and_raise_error(err)

        data_input = {}

        for key in ["/X_train", "/y_train", "/X_val", "/y_val"]:
            if key not in store.keys():
                data_input[key.replace("/", "")] = None
            else:
                data_input[key.replace("/", "")] = store[key]

        if cv_pickle_path is not None:
            try:
                assert(os.path.isfile(cv_pickle_path))
            except AssertionError:
                err = "Parameter hdf5_store_path is not a file"
                self._logger.log_and_raise_error(err, ErrorType=NameError)

            try:
                data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
                self._cv_path = cv_pickle_path
            except Exception as e:
                err = "Could not load the pickeled cv. Exit with error: {}."\
                    .format(e)
                self._logger.log_and_raise_error(err)
        else:
            data_input["cv"] = None

        self.attach_data(**data_input)

        store.close()

    def configer_summary_saving(self,
                                save_method: Callable = None,
                                kwargs: dict = None):
        """
        Attaching a method for saving information about
             the trials/space/strategy and the result of
             the current best pipeline. This method can
             save the result in a txt or a json file,
             or in a database for example. Arguments like
             file path or the table name can be specified in kwargs.
        :param Callable save_method: method for saving the result
            of the pipeline selection. The method must accept
            a pandas DataFrame as argument. See self._save_result
            method for the format of the argument being saved.
            By default, saving to a csv file.
            Examples:
                functools.partial(pd.DataFrame.to_csv,
                                  **{"path_or_buf": <PATH>})
                functools.partial(np.savetxt, **{"fname": <PATH>})

                functools.partial(SQLHandler(<URI>).append_to_table,
                                  **{"tablename": <NAME>})

                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
                                  **{"collection_name": <NAME>})

            using functools can be avoided by providing the kwarg argument
        :param dict kwargs: a dictionary with keyword arguments
            (like tablename) to provide to the save_method
        """
        try:
            save_method = save_method or functools.partial(
                    pd.DataFrame.to_excel, **{"path_or_buf": "result.csv"})

            kwargs = kwargs or {}

            self._save_method = functools.partial(save_method, **kwargs)

            self.configured_summary_saving = True

            self._logger.info("Configured summary saving")

        except Exception as e:
            err = ("Failed to configure the summary saving. "
                   "Exit with error {}".format(e))
            self._logger.log_and_raise_error(err)

    def _save_summary(self, summary: dict):
        """
        """
        try:
            assert(self.configured_summary_saving)
        except AssertionError:
            err = "Result saving must be configured first"
            self._logger.log_and_raise_error(err, ErrorType=AssertionError)

        try:
            self._save_method(summary)

        except Exception as e:
            err = ("Could not configure summary saving. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def _evaluate(self, pipeline: Pipeline,
                  scoring: Callable = None,
                  cross_validation: Callable = None) -> dict:
        """
        This method is called in _objective.

        Calculates the cost on the attached data.
        This function can be overriden, when the cost
        needs to be calculated differently,
        for example with a tensorflow model.

        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation
        :param cross_validation: a function that has the same
             signature as sklearn.model_selection.cross_validate

        :return: dictionary with the aggregated
            cross-validation score and
            the score variance.
        """
        try:

            scoring = {"score": make_scorer(self._cost_func)}

            scoring.update({metric_name: make_scorer(metric)
                            for metric_name, metric
                            in self._additional_metrics.items()})

            scores = self._cross_validation(
                    estimator=pipeline,
                    X=self._X,
                    y=self._y,
                    cv=self._cv or 5,
                    scoring=scoring,
                    error_score=np.nan)

            scores_average = {
                    metric_name.replace("test_", ""):
                    self._cross_val_averaging_func(scores[metric_name])
                    for metric_name in scores
                    if metric_name.startswith("test")}

            scores_variance = {
                    metric_name.replace("test_", "") + "_variance":
                    np.var(scores[metric_name])
                    for metric_name in scores
                    if metric_name.startswith("test")}

            return {**scores_average, **scores_variance}

        except Exception as e:
            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def _objective(self, space_element: dict) -> dict:
        '''
        This method is called in search_for_best_pipeline
        inside the hyperopt fmin method.

        Uses _evaluate method.

        It must take as input a space element
        and produce an output in the form of dictionary
        with 2 obligatory values loss and status
        (STATUS_OK or STATUS_FAIL). Other
        values in the output are optional and can be
        accessed later through the trials object.

        :Warning: fmin minimizes the loss,
        when _evaluate returns a value to be maximized,
        it should be multiplied by -1 to obtain loss.

        :param dict space_element: must contain keys
            name (with the name of the pipeline),
            pipeline (Pipeline object),
            params (dict of pipeline params)

        :output: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            uderstood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time: execution time
        '''
        try:
            assert(isinstance(space_element, dict) and
                   set(['name', 'pipeline', 'params']) <= space_element.keys())

            assert(isinstance(space_element['name'], str) and
                   isinstance(space_element['pipeline'], Pipeline) and
                   isinstance(space_element['params'], dict))

        except AssertionError:
            err = "Space elements are of wrong form"
            self._logger.log_and_raise_error(err)

        start_time = time.time()

        try:
            assert(self.attached_data)
        except AssertionError:
            err = ("Data must be attached in order "
                   "in order to effectuate the best"
                   "pipeline search")
            self._logger.log_and_raise_error(err)

        summary = {}

        if self._strategy_name is not None:
            summary["strategy_name"] = self._strategy_name

        if isinstance(self._cost_func, str):
            summary["cost_func"] = self._cost_func

        elif hasattr(self._cost_func, "__name__"):
            summary["cost_func"] = self._cost_func.__name__

        summary["trials_path"] = self.trials_path

        if self._data_path is not None:
            summary["data_path"] = self._data_path

        if self._cv_path is not None:
            summary["cv_path"] = self._cv_path

        summary["start_tuning_time"] = self.start_tuning_time

        summary["iteration"] = self._iteration

        backup_cond = (self._backup_trials_freq is not None) and\
            ((self._iteration - self._start_iteration - 1) %
             self._backup_trials_freq == 0) or\
            self._score_improved

        if backup_cond:
            self._backup_trials()
            self._score_improved = False

        try:
            pipeline = space_element['pipeline']
            params = space_element['params']
            pipeline.set_params(**params)

            self._logger.info(("Iteration {0}: "
                               "Current score is {1}: "
                               "Training pipeline {2} "
                               "with parameters: {3}. ").format(
                                  self._iteration,
                                  self.best_score,
                                  space_element['name'],
                                  params))

            result = self._evaluate(pipeline)

            summary.update(result)

            end_time = time.time()

            assert(not np.isnan(result["score"])),\
                "Score value is not in the output of the _evaluate method"

            summary['status'] = STATUS_OK
            summary.update(result)
            summary['loss'] = self._score_factor * summary['score']
            summary['timestamp'] = datetime.datetime.today()
            summary['train_time'] = end_time - start_time

            self._iteration += 1

            self._score_improved = (self.best_score != self.best_score) or\
                                   (self._score_factor*result["score"] <
                                    self._score_factor*self.best_score)

            if self._score_improved:

                self._logger.info("Score improved, new best score is: {}"
                                  .format(result["score"]))

                self.best_score = result['score']

                if self.configured_summary_saving:
                    self._save_summary(summary)

        except Exception as e:

            self._logger.warning("Trial failed with error {}".format(e))

            summary['status'] = STATUS_FAIL
            summary['timestamp'] = datetime.datetime.today()
            summary['error'] = e
            for key in ['loss', 'score', 'score_variance', 'train_time']:
                summary[key] = np.nan

        return summary

    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self.space
        When run_trials method is finished the flag self.finished_tuning
        should be set to True and the methods self._backup_trials and
        optionally self._save_result should be called.
        """
        pass

    @abstractproperty
    def number_of_trials(self) -> int:
        """
        Number of trials already run in the current trials object
        """
        pass

    @abstractproperty
    def best_trial(self) -> dict:
        """
        Best trial sor far.
         Should contain the best pipeline,
         best hyperparameters,
         as well as an output of the self._objective method,
         but the exact form of the output depends on the implementation
         of the Trials object.
        """
        pass

    @abstractproperty
    def best_trial_score(self) -> float:
        """
        Score of the best pipeline with the best hyperparameters
        """
        pass

    @abstractproperty
    def best_trial_score_variance(self) -> float:
        """
        """
        pass

    @abstractproperty
    def best_trial_pipeline(self) -> Pipeline:
        """
        Best pipeline with best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with corresponding
        best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline-types
        with corresponding hyperparameters
        """
        pass

    @abstractmethod
    def trials_to_excel(self, path: str):
        """
        Trials object in the shape of table written to excel,
        should contain the iteration, pipeline (as str),
        hyperparamters (as str), self.best_result (see self._objective method)
        as well as additional information configured
        through self.save_result method.
        """
        pass