#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 9 13:27:44 2018

@author: tanja
@description: Implementation of machine learning
pipeline selection and tuning with hyperopt library
"""
import datetime
import gc
import importlib.util
import logging
import os
import pickle
import sys
import time

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
    space_eval, pyll
from sklearn.base import clone
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline


class HyperoptPipelineSelection:
    '''
    Use this class to perform a search
    for a machine learning pipeline in a given parameter space.
    The parameter space can include multiple types of Pipelines
    (SVM, XGBOOST, random forest, etc),
    as well as parameter distributions for each pipeline parameter.
    See example in main for the expected space structure.

    The search can be performed either randomly
    or with a tree-based algorithm (the two algorithms accepted
    by search_for_best_pipeline).

    Attribute _trials is responsible for book-keeping parameter
    combinations that have already been tried out. It is pickled
    to trials_path every backup_trials_freq trials as well as every time
    a better pipeline was found.
    '''

    def __init__(self,
                 cost_func,
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = 1,
                 log_path: str = None,
                 averaging_func: callable = None):
        '''
        :param callable cost_func: function to minimize or maximize
        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.
        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and pipelines, and parameters tried out so far. If a trials
            object already exists at the given path, it is loaded and the
            search is continued, else, the search is started from
            the beginning.
        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            A falsy value (0 or None) disables the periodic backup.
        :param str log_path: Optional, when not provided logs to stdout.
        :param callable averaging_func: optional,
            when not provided set to mean. Function
            to aggregate the cross-validated values of the cost function.
            Classic situation is to take the mean,
            another example is, for example mean() - c*var().
        :raises AssertionError: when an argument has an invalid type
        '''
        assert callable(cost_func),\
            "Parameter 'cost_func' must be a callable"
        assert isinstance(greater_is_better, bool),\
            "Parameter 'greater_is_better' must be bool type"
        assert isinstance(trials_path, str),\
            "Parameter 'trials_path' must be of string type"
        if averaging_func is not None:
            assert callable(averaging_func),\
                "Parameter 'averaging_func' must be a callable"

        self._assert_valid_directory(path=trials_path)
        self._configer_logger(log_path)

        self._cost_func = cost_func
        # is 1 when cost_func is minimized, -1 when cost func is maximized
        self._score_factor = -1 if greater_is_better else 1
        self._trials_path = trials_path
        self._backup_trials_freq = backup_trials_freq
        self._averaging_func = averaging_func or np.mean
        # keeping track of the current search iteration
        self._run_number = 0
        # space and data need to be attached to perform search.
        self._space_attached = False
        self._data_attached = False
        # if a trials object already exists at the given path,
        # it is loaded and the search is continued. Else,
        # the search is started from the beginning.
        self._trials = self._load_trials(trials_path)
        self._best_score = self.best_trial_score

    def _load_trials(self, trials_path: str) -> Trials:
        '''
        Returns the trials object pickled at trials_path, or an empty
        Trials object when the file is missing or cannot be loaded.
        '''
        if not os.path.isfile(trials_path):
            self._logger.info("No existing trials object was found. "
                              "Initialized an empty trials object.")
            return Trials()
        try:
            # NOTE(security): unpickling executes arbitrary code.
            # trials_path must only point to files written by this class.
            with open(trials_path, "rb") as f:
                trials = pickle.load(f)
            if not isinstance(trials, Trials):
                raise TypeError("unpickled object has type {}, "
                                "expected a hyperopt Trials object"
                                .format(type(trials).__name__))
        except Exception as e:
            self._logger.error(("Trials object could not be loaded. "
                                "Training starts from the beginning. "
                                "Exit with error {}").format(e))
            return Trials()
        self._logger.info(("Loaded an existing trials object "
                           "consisting of {} trials")
                          .format(len(trials.trials)))
        return trials

    def _configer_logger(self, log_path: str = None):
        '''
        Can be replaced with the existing script later.
        When log_path is not provided, logs to stdout.
        '''
        self._logger = logging.getLogger(__name__)
        # The logger is looked up by module name, hence shared between
        # instances: handlers left by a previous instance are removed
        # (and closed) so that records are not emitted several times.
        for old_handler in list(self._logger.handlers):
            self._logger.removeHandler(old_handler)
            old_handler.close()
        if log_path is not None:
            assert isinstance(log_path, str),\
                "Parameter 'log_path' must be of string type"
            self._assert_valid_directory(log_path)
            handler = logging.FileHandler(log_path)
        else:
            handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            '\n %(asctime)s %(levelname)s %(message)s')
        handler.setFormatter(formatter)
        self._logger.addHandler(handler)
        self._logger.setLevel("INFO")

    def _backup_trials(self):
        '''
        Pickles (Saves) the trials object at trials_path.
        '''
        with open(self._trials_path, "wb") as f:
            pickle.dump(self._trials, f)

    def _backup_trials_best_effort(self):
        '''
        Saves the trials object; a failure is logged instead of raised,
        so that a successfully evaluated trial is not lost
        because of a backup problem.
        '''
        try:
            self._backup_trials()
        except Exception as e:
            self._logger.warning(
                "Trials backup failed with error {}".format(e))

    def _assert_valid_directory(self, path: str):
        '''
        If the directory of a path does not exist yet,
        creates it (including the missing parent directories).

        :param str path: path to a file
        '''
        assert isinstance(path, str),\
            "Parameter 'path' must of str type"
        dirname = os.path.dirname(path)
        # an empty dirname means the current working directory
        if len(dirname) > 0:
            os.makedirs(dirname, exist_ok=True)

    def attach_space(self, space: pyll.base.Apply = None,
                     module_path: str = None,
                     name: str = None):
        '''
        :param pyll.base.Apply space: hyperopt space where
            the search is performed. Optional when a space
            is loaded from a python module.
        :param str module_path: path to python module
            where the space is defined. Optional when
            the space is provided directly.
        :param str name: name of the space loaded from
            a python module. Optional when the space
            is provided directly.
        :raises AssertionError: when the arguments are invalid
        :raises Exception: when the module cannot be imported or
            does not define the requested name
        '''
        assert (space is not None) or\
            ((module_path is not None) and (name is not None)),\
            "Either space or (module_path, name) must be provided"

        if space is None:
            for param_name, value in (("module_path", module_path),
                                      ("name", name)):
                assert isinstance(value, str),\
                    "Parameter '{}' must be of str type".format(param_name)
            assert os.path.isfile(module_path),\
                "Parameter 'module_path' must be a valid file"
            module_name, extension = os.path.splitext(
                os.path.basename(module_path))
            assert extension == ".py",\
                "Parameter 'space' must be read from a python file"
            try:
                # load the module directly from its file location,
                # without modifying sys.path
                spec = importlib.util.spec_from_file_location(
                    module_name, module_path)
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                space = getattr(module, name)
            except (ImportError, AttributeError) as e:
                err = "Invalid space location or name"
                self._logger.error(err)
                raise Exception(err) from e

        assert isinstance(space, pyll.base.Apply),\
            "Parameter 'space' must be of hyperopt space type"
        self._space = space
        self._logger.info("Attached parameter distribution space")
        self._space_attached = True

    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
            -> np.ndarray:
        '''
        Converts a DataFrame or a Series to a numpy array.
        Numpy arrays are returned unchanged.

        :raises ValueError: for any other input type
        '''
        if isinstance(x, np.ndarray):
            return x
        if isinstance(x, (pd.DataFrame, pd.Series)):
            return x.values
        e = 'The argument must be a numpy array or a pandas DataFrame'
        self._logger.critical(e)
        raise ValueError(e)

    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
                    X_val: (pd.DataFrame, np.ndarray) = None,
                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
                    cv: (list, int) = None):
        '''
        :param array X_train: data on which
            machine learning pipelines are trained
        :param array y_train: optional, vector with targets,
            (not all algorithms require a targets)
        :param array X_val: optional, validation data.
            When not provided, cross-validated value
            of the cost_func is calculated.
        :param array y_val: optional, validation targets
        :param list cv: list of tuples containing
            train and validation indices or an integer representing
            the number of folds for a random split of data
            during cross-validation
            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
        :raises ValueError: when y_train is given together with X_val
            but y_val is missing, or when an input has an invalid type
        '''
        X_train = self._convert_to_array(X_train)
        if y_train is not None:
            y_train = self._convert_to_array(y_train)

        if X_val is not None:
            if cv is not None:
                self._logger.warning(("Both validation set and cv object "
                                      "are set. Validation score will be "
                                      "calculated on the validation set!"))
            X_val = self._convert_to_array(X_val)
            # cost is evaluated with a cross validation function
            # that accepts an array and a cv object with
            # indices of the fold splits.
            # Here we create a trivial cv object
            # with one validation split.
            n_train, n_val = len(X_train), len(X_val)
            train_inds = list(range(n_train))
            val_inds = list(range(n_train, n_train + n_val))
            self._cv = [(train_inds, val_inds)]
            self._X = np.concatenate([X_train, X_val])
            if y_train is None:
                self._y = None
            elif y_val is None:
                err = "Argument y_val must be provided"
                self._logger.critical(err)
                raise ValueError(err)
            else:
                y_val = self._convert_to_array(y_val)
                self._y = np.concatenate([y_train, y_val])
        else:
            if cv is None:
                self._logger.warning(("Neither validation set nor cv object "
                                      "are set. Validation score will be "
                                      "calculated on 5 randomly "
                                      "splitted folds."))
            self._X = X_train
            self._y = y_train
            self._cv = cv

        self._logger.info("Attached data")
        self._data_attached = True

    def _evaluate(self, pipeline: Pipeline) -> dict:
        '''
        This method is called in _objective.
        Calculates the cost on the attached data.
        This function can be overriden, when the cost
        needs to be calculated differently,
        for example with a tensorflow model.

        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation
        :output: dictionary with the aggregated
            cross-validation score (key 'value') and
            the score variance (key 'variance').
        '''
        scores = cross_validate(estimator=pipeline,
                                X=self._X,
                                y=self._y,
                                # 5 folds when no cv object was attached
                                cv=self._cv or 5,
                                scoring=make_scorer(self._cost_func),
                                # a failing fold yields nan instead of
                                # raising; _objective rejects nan scores
                                error_score=np.nan)
        test_scores = scores['test_score']
        return {'value': self._averaging_func(test_scores),
                'variance': np.var(test_scores)}

    def _objective(self, space_element: dict) -> dict:
        '''
        This method is called in search_for_best_pipeline
        inside the hyperopt fmin method.
        Uses _evaluate method.
        It must take as input a space element
        and produce an output in the form of dictionary
        with 2 obligatory values loss and status;
        other values in the output are optional and can be
        accessed later through the trials object.

        :Warning: fmin minimizes the loss,
            when _evaluate returns a value to be maximized,
            it is multiplied by -1 to obtain the loss.

        :param dict space_element: must contain keys
            name (with the name of the pipeline),
            pipeline (Pipeline object),
            params (dict of pipeline params)
        :output: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            understood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time: execution time
        :raises Exception: when no data is attached
        '''
        assert (isinstance(space_element, dict) and
                {'name', 'pipeline', 'params'} <= space_element.keys())
        assert (isinstance(space_element['name'], str) and
                isinstance(space_element['pipeline'], Pipeline) and
                isinstance(space_element['params'], dict))

        start_time = time.time()
        if not self._data_attached:
            raise Exception("Data must be attached in order "
                            "to effectuate the best pipeline search")

        self._run_number += 1
        pipeline = space_element['pipeline']
        params = space_element['params']
        pipeline.set_params(**params)
        self._logger.info(("Run number {0}: "
                           "Current score is {1}: "
                           "Training pipeline {2} "
                           "with parameters: {3}. ").format(
                               self._run_number,
                               self._best_score,
                               space_element['name'],
                               params))

        try:
            score_stats = self._evaluate(pipeline)
            assert not np.isnan(score_stats["value"]),\
                "Returned null score"
        except Exception as e:
            # a failed evaluation must not stop the whole search:
            # it is reported to hyperopt as a failed trial
            self._logger.warning("Trial failed with error {}".format(e))
            return {'loss': np.nan,
                    'status': STATUS_FAIL,
                    'score': np.nan,
                    'score_variance': np.nan,
                    'timestamp': datetime.datetime.today(),
                    'train_time': np.nan}

        score = score_stats["value"]
        if self._backup_trials_freq and\
                self._run_number % self._backup_trials_freq == 0:
            self._backup_trials_best_effort()

        # _best_score is nan as long as no trial succeeded
        if np.isnan(self._best_score) or\
                self._score_factor*score <\
                self._score_factor*self._best_score:
            self._logger.info("Score got better, new best score is: {}"
                              .format(score))
            self._best_score = score
            self._backup_trials_best_effort()

        return {'loss': self._score_factor * score,
                'status': STATUS_OK,
                'score': score,
                'score_variance': score_stats["variance"],
                'timestamp': datetime.datetime.today(),
                'train_time': time.time() - start_time}

    def search_for_best_pipeline(self,
                                 niter: int,
                                 algo: callable = tpe.suggest):
        '''
        Method performing the search of the best pipeline in the given space.
        Calls fmin function from the hyperopt library to minimize the output
        of _objective.

        :params int niter: number of search iterations
        :param callable algo: now can only take values tpe for a tree-based
            random search or random for random search
        :raises AssertionError: when no space is attached or
            an argument is invalid
        :raises ValueError: when the search fails
        '''
        assert self._space_attached,\
            "Space must be attached to be able to perform the search."
        assert isinstance(niter, int),\
            "Parameter 'niter' must be of int type"
        # right now only two algorithms are provided by hyperopt
        assert algo in [tpe.suggest, rand.suggest],\
            ("Parameter 'algo' can be now only tpe or random. "
             "If other algorithms have been developped by "
             "by hyperopt, plased add them to the list.")

        try:
            n_previous = len(self._trials.trials)
            self._logger.info(("Starting {0} iterations of search "
                               "additional to {1} previous"
                               .format(niter, n_previous)))
            # max_evals counts the trials already stored in the trials
            # object, hence the previous trials are added to niter
            best = fmin(fn=self._objective,
                        space=self._space,
                        algo=algo,
                        trials=self._trials,
                        max_evals=n_previous + niter)
            best_result = self._trials.best_trial["result"]
            self._logger.info(
                "Best score is {0} with variance {1}"
                .format(best_result["score"],
                        best_result["score_variance"]))
            self._logger.info(("Finished {0} iterations of search.\n"
                               "Best parameters are:\n {1} ")
                              .format(niter,
                                      space_eval(self._space, best)))
            self._backup_trials()
        except Exception as e:
            raise ValueError(("Failed to select best "
                              "pipeline! Exit with error: {}")
                             .format(e)) from e

    def _has_successful_trial(self) -> bool:
        '''
        True when at least one trial finished with STATUS_OK.
        '''
        return any(trial["result"].get("status") == STATUS_OK
                   for trial in self._trials.trials)

    def _trial_space_element(self, trial: dict) -> dict:
        '''
        Evaluates the attached space at the hyperparameter
        values recorded in the given trial.
        '''
        # hyperparameters that are inactive in a trial
        # are stored as empty lists and must be skipped
        assignment = {label: values[0] for label, values
                      in trial['misc']['vals'].items()
                      if len(values) > 0}
        return space_eval(self._space, assignment)

    def _configured_pipeline(self, space_element: dict) -> Pipeline:
        '''
        Returns an independent copy of the pipeline of a space element,
        configured with the parameters of this space element.
        The pipeline object stored in the space is shared by all trials,
        so it cannot be returned as is.
        '''
        return clone(space_element["pipeline"])\
            .set_params(**space_element["params"])

    @property
    def best_trial_score(self) -> float:
        '''
        Score of the best successful trial,
        nan when there is no successful trial.
        '''
        if self._has_successful_trial():
            return self._trials.best_trial["result"]["score"]
        return np.nan

    @property
    def best_trial_score_variance(self) -> float:
        '''
        Score variance of the best successful trial,
        nan when there is no successful trial.
        '''
        if self._has_successful_trial():
            return self._trials.best_trial["result"]["score_variance"]
        return np.nan

    @property
    def best_trial_pipeline(self) -> Pipeline:
        '''
        Pipeline of the best successful trial,
        configured with the parameters of this trial.

        :raises Exception: when there is no successful trial
        '''
        assert self._space_attached,\
            "Space must be attach to be able to retrieve this information."
        if not self._has_successful_trial():
            err = ("Trials object is empty. "
                   "Best pipeline cannot be returned")
            self._logger.error(err)
            raise Exception(err)
        return self._configured_pipeline(
            self._trial_space_element(self._trials.best_trial))

    def _ith_trial_loss(self, i: int) -> float:
        '''
        Loss of the i-th trial, nan when the trial
        does not exist or has no loss.
        '''
        if i < len(self._trials.trials):
            return self._trials.trials[i]['result'].get('loss', np.nan)
        return np.nan

    def _ith_trial_element(self, i: int, name: str) -> object:
        '''
        Value stored under the key name in the space element
        of the i-th trial, None when the trial does not exist.
        '''
        assert self._space_attached,\
            "Space must be attach to be able to retrieve this information."
        if i < len(self._trials.trials):
            return self._trial_space_element(self._trials.trials[i])[name]
        return None

    def _ith_trial_pipeline(self, i: int) -> Pipeline:
        '''
        Pipeline of the i-th trial configured with the parameters
        of this trial, None when the trial does not exist.
        '''
        assert self._space_attached,\
            "Space must be attach to be able to retrieve this information."
        if i < len(self._trials.trials):
            return self._configured_pipeline(
                self._trial_space_element(self._trials.trials[i]))
        return None

    def _ith_trial_name(self, i: int) -> str:
        '''
        Pipeline name of the i-th trial.
        '''
        return self._ith_trial_element(i=i, name='name')

    def _ith_trial_params(self, i: int) -> dict:
        '''
        Pipeline parameters of the i-th trial.
        '''
        return self._ith_trial_element(i=i, name='params')

    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
        '''
        Timestamp of the i-th trial, None when the trial does not exist.
        '''
        if i < len(self._trials.trials):
            return self._trials.trials[i]["result"]["timestamp"]
        return None

    def _best_trial_indices(self, indexed_losses, n: int) -> list:
        '''
        :param indexed_losses: iterable of (trial index, loss) tuples
        :param int n: number of indices to return
        :output: indices of the trials with the n smallest distinct
            losses, best first. Trials without a valid loss
            (failed trials) are ignored; among trials with the same
            loss only the first one is kept.
        '''
        ranked = sorted((loss, i) for i, loss in indexed_losses
                        if loss is not None and not np.isnan(loss))
        seen_losses = set()
        distinct = []
        for loss, i in ranked:
            if loss not in seen_losses:
                seen_losses.add(loss)
                distinct.append(i)
        return distinct[:n]

    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
        '''
        Returns the list of n best pipelines
        documented in trials.

        :param int n: number of pipelines to return
        :param list losses: optional, losses aligned with the trials
            (the i-th loss belongs to the i-th trial). When not provided,
            the losses stored in the trials object are used.
        :raises Exception: when the trials object is empty
        '''
        n_trials = len(self._trials.trials)
        if n_trials == 0:
            err = ("Trials object is empty. "
                   "Best pipeline cannot be returned")
            self._logger.error(err)
            raise Exception(err)
        if losses is None:
            losses = [self._ith_trial_loss(i) for i in range(n_trials)]
        return [self._ith_trial_pipeline(i)
                for i in self._best_trial_indices(enumerate(losses), n)]

    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
        '''
        Returns a dictionary where keys are pipeline names,
        and values are lists of the n best pipelines with this name.

        :raises Exception: when the trials object is empty
        '''
        assert isinstance(n, int), "Parameter 'n' must be an integer"
        n_trials = len(self._trials.trials)
        if n_trials == 0:
            err = ("Trials object is empty. "
                   "Best pipeline cannot be returned")
            self._logger.error(err)
            raise Exception(err)

        # group the trial indices by pipeline name in a single pass
        indices_per_type = {}
        for i in range(n_trials):
            indices_per_type.setdefault(self._ith_trial_name(i), [])\
                .append(i)

        best_pipelines_per_type = {}
        for nm, indices in indices_per_type.items():
            best_indices = self._best_trial_indices(
                ((i, self._ith_trial_loss(i)) for i in indices), n)
            best_pipelines_per_type[nm] = [self._ith_trial_pipeline(i)
                                           for i in best_indices]
        return best_pipelines_per_type

    def write_trials_documentation(self, path: str = None):
        '''
        Saves an excel file with pipeline names, scores,
        parameters, and timestamps.

        :param str path: optional, path of the excel file. When not
            provided, hyperopt_trials_documentation.xlsx in the
            current working directory is used.
        '''
        path = path or "hyperopt_trials_documentation.xlsx"
        assert isinstance(path, str),\
            "Parameter 'path' must be of string type"
        self._assert_valid_directory(path)

        trial_indices = range(len(self._trials.trials))
        names = [self._ith_trial_name(i) for i in trial_indices]
        # loss = score_factor * score and score_factor is 1 or -1,
        # so multiplying the loss by the factor gives back the score
        scores = [self._score_factor*self._ith_trial_loss(i)
                  for i in trial_indices]
        params = [self._ith_trial_params(i) for i in trial_indices]
        timestamps = [self._ith_trial_timestamp(i) for i in trial_indices]

        pd.DataFrame({"name": names,
                      "score": scores,
                      "params": params,
                      "timestamp": timestamps})\
            .to_excel(path)


if __name__ == '__main__':
    # Demo: search for the best of two pipeline types on the iris data.

    # make_scorer is imported here so that it is available
    # as a module global when this file is run as a script
    from sklearn.metrics import roc_auc_score, make_scorer
    from xgboost import XGBClassifier
    from sklearn.svm import SVC
    from sklearn.feature_selection import SelectKBest
    from sklearn.decomposition import PCA
    from sklearn.datasets import load_iris
    from pprint import pprint

    iris = load_iris()
    X = pd.DataFrame(iris.data)
    # produce a binary variable: class 2 against the rest
    y = (pd.Series(iris.target) == 2).astype(int)
    del iris
    gc.collect()

    # SPACE DEFINITION ########################################
    # (can be moved to a separate python script)
    #
    # A search space must be a list of dictionaries.
    # Each dictionary must have keys:
    #     name (pipeline name or type),
    #     pipeline (instance of sklearn.pipeline.Pipeline),
    #     params (dictionary of distributions for the parameters of
    #             the pipeline that we want to tune)
    # Here we have a space that consists of two dictionaries.
    #
    # A pipeline consists of steps (tuples).
    # Each step has a name and an algorithm.
    # The first pipeline, as a first step performs
    # feature selection with SelectKBest and
    # as a second step evaluates a machine learning algo (xgboost).
    # Like all sklearn algorithms, a Pipeline has methods
    # fit, predict, set_params, get_params
    #
    # Pipeline parameter dictionaries must be of the form:
    #     {'kbest__k': 3, xgb__n_estimators: 20},
    # each parameter name consists of the step name, __, and parameter name.
    # Here, instead of values, the parameter names are followed
    # by hyperopt distributions.
    # Each hyperopt distribution also must have a name,
    # due to hyperopt functionality.
    # Here, we set the hyperopt distribution name to the parameter name,
    # but it does not have to be so. Hyperopt distribution names
    # must be different for different elements of the space.
    kbest_xgboost = {
        "name": "KBEST_XGBOOST",
        "pipeline": Pipeline([
            ('kbest', SelectKBest()),
            ('xgb', XGBClassifier())
        ]),
        "params": {
            'kbest__k': hp.choice('kbest__k', range(1, 5)),
            'xgb__n_estimators':
                50 + hp.randint('xgb__n_estimators', 50),
            "xgb__learning_rate":
                hp.loguniform('xgb__learning_rate',
                              np.log(0.01), np.log(0.2))
        },
    }

    pca_svc = {
        "name": "PCA_SVC",
        "pipeline": Pipeline([
            ('pca', PCA()),
            ('svc', SVC(gamma="scale"))
        ]),
        "params": {
            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
        },
    }

    # one of the two pipeline types is drawn in every trial
    space = hp.choice('pipelines', [kbest_xgboost, pca_svc])

    # TESTING ##########################################################
    trials_path = 'TEST_hyperopt_trials.pkl'
    doc_path = 'TEST_hyperopt_doc.xlsx'

    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
                                       greater_is_better=True,
                                       trials_path=trials_path)
    hp_obj.attach_data(X_train=X, y_train=y)
    hp_obj.attach_space(space=space)
    hp_obj.search_for_best_pipeline(niter=10)

    print('\n', '='*20, 'TESTING', '='*20)
    print('\n', 'Best score:', hp_obj.best_trial_score)
    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)

    print('\n', 'Best 3 pipelines: \n')
    pprint(hp_obj.get_n_best_trial_pipelines(n=3))

    print('\n', 'Best pipeline per type: \n')
    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))

    hp_obj.write_trials_documentation(path=doc_path)
    # The files at trials_path and doc_path are left on disk; an existing
    # trials file is loaded and the search continued on the next run.