|
@@ -4,6 +4,13 @@
|
|
Created on Wed Sep 30 14:23:23 2020
|
|
Created on Wed Sep 30 14:23:23 2020
|
|
|
|
|
|
@author: tanya
|
|
@author: tanya
|
|
|
|
+@description: an abstract class for selecting a machine learning
|
|
|
|
+ pipeline in a space of parameter distributions over multiple pipelines.
|
|
|
|
+ The selection is designed in such a way that a Trials object is being
|
|
|
|
+ maintained during the tuning process from which one can retrieve
|
|
|
|
+ the best pipeline so far as well as the entire tuning history
|
|
|
|
+ if needed.
|
|
|
|
+ Child classes: hyperopt and custom gridsearch.
|
|
"""
|
|
"""
|
|
|
|
|
|
import pickle
|
|
import pickle
|
|
@@ -14,7 +21,7 @@ import datetime
|
|
from typing import Callable
|
|
from typing import Callable
|
|
import numpy as np
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pandas as pd
|
|
-from abc import ABC, abstractmethod
|
|
|
|
|
|
+from abc import ABC, abstractmethod, abstractproperty
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.pipeline import Pipeline
|
|
from sklearn.model_selection import cross_validate as sklearn_cross_validator
|
|
from sklearn.model_selection import cross_validate as sklearn_cross_validator
|
|
from sklearn.metrics import make_scorer
|
|
from sklearn.metrics import make_scorer
|
|
@@ -28,14 +35,19 @@ sys.path.append(os.getcwd())
|
|
|
|
|
|
class PipelineSelector(ABC):
|
|
class PipelineSelector(ABC):
|
|
"""
|
|
"""
|
|
|
|
+ An abstract class for selecting a machine learning
|
|
|
|
+ pipeline in a space of parameter distributions over multiple pipelines.
|
|
|
|
+ The selection is designed in such a way that a Trials object is being
|
|
|
|
+ maintained during the tuning process from which one can retrieve
|
|
|
|
+ the best pipeline so far as well as the entire tuning history
|
|
|
|
+ if needed.
|
|
"""
|
|
"""
|
|
def __init__(self,
|
|
def __init__(self,
|
|
cost_func,
|
|
cost_func,
|
|
greater_is_better: bool,
|
|
greater_is_better: bool,
|
|
trials_path: str,
|
|
trials_path: str,
|
|
backup_trials_freq: int = 1,
|
|
backup_trials_freq: int = 1,
|
|
- log_path: str = None,
|
|
|
|
- averaging_func: callable = None):
|
|
|
|
|
|
+ cross_val_averaging_func: callable = None):
|
|
'''
|
|
'''
|
|
:param callable cost_func: function to minimize or maximize
|
|
:param callable cost_func: function to minimize or maximize
|
|
|
|
|
|
@@ -55,27 +67,31 @@ class PipelineSelector(ABC):
|
|
|
|
|
|
:param str log_path: Optional, when not provided logs to stdout.
|
|
:param str log_path: Optional, when not provided logs to stdout.
|
|
|
|
|
|
- :param callable averaging_func: optional,
|
|
|
|
|
|
+ :param callable cross_val_averaging_func: optional,
|
|
when not provided set to mean. Function
|
|
when not provided set to mean. Function
|
|
to aggregate the cross-validated values of the cost function.
|
|
to aggregate the cross-validated values of the cost function.
|
|
Classic situation is to take the mean,
|
|
Classic situation is to take the mean,
|
|
another example is mean() - c*var().
|
|
another example is mean() - c*var().
|
|
'''
|
|
'''
|
|
-
|
|
|
|
- assert(callable(cost_func)),\
|
|
|
|
- "Parameter 'cost_func' must be a callable"
|
|
|
|
-
|
|
|
|
- assert(isinstance(greater_is_better, bool)),\
|
|
|
|
- "Parameter 'greater_is_better' must be bool type"
|
|
|
|
-
|
|
|
|
- assert(isinstance(trials_path, str)),\
|
|
|
|
- "Parameter 'trials_path' must be of string type"
|
|
|
|
-
|
|
|
|
- if averaging_func is not None:
|
|
|
|
- assert(callable(averaging_func)),\
|
|
|
|
- "Parameter 'averaging_func' must be a callable"
|
|
|
|
-
|
|
|
|
- self._logger = Log("PipelineSelector")
|
|
|
|
|
|
+ self._logger = Log("PipelineSelector: ")
|
|
|
|
+
|
|
|
|
+ input_errors = [(cost_func, Callable,
|
|
|
|
+ "Parameter 'cost_func' must be a callable"),
|
|
|
|
+ (greater_is_better, bool,
|
|
|
|
+ "Parameter 'greater_is_better' must be bool type"),
|
|
|
|
+ (trials_path, str,
|
|
|
|
+ "Parameter 'trials_path' must be of string type"),
|
|
|
|
+ (cross_val_averaging_func, (Callable, None.__class__),
|
|
|
|
+ ("Parameter 'cross_val_averaging_func'"
|
|
|
|
+ "must be a callable")),
|
|
|
|
+ (backup_trials_freq, int,
|
|
|
|
+ "Parameter backup_trials_freq must be an int")]
|
|
|
|
+
|
|
|
|
+ for p, t, err in input_errors:
|
|
|
|
+ try:
|
|
|
|
+ assert(isinstance(p, t))
|
|
|
|
+ except AssertionError:
|
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
|
|
ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
|
|
ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)
|
|
|
|
|
|
@@ -83,16 +99,18 @@ class PipelineSelector(ABC):
|
|
# is 1 when cost_func is minimized, -1 when cost_func is maximized
|
|
# is 1 when cost_func is minimized, -1 when cost_func is maximized
|
|
self._score_factor = (not greater_is_better) - greater_is_better
|
|
self._score_factor = (not greater_is_better) - greater_is_better
|
|
self._trials_path = trials_path
|
|
self._trials_path = trials_path
|
|
- # is initialized with empty trials object
|
|
|
|
- self._trials = None
|
|
|
|
self._backup_trials_freq = backup_trials_freq
|
|
self._backup_trials_freq = backup_trials_freq
|
|
- self._averaging_func = averaging_func or np.mean
|
|
|
|
|
|
+ self._cross_val_averaging_func = cross_val_averaging_func or np.mean
|
|
# keeping track of the current search iteration
|
|
# keeping track of the current search iteration
|
|
self._run_number = 0
|
|
self._run_number = 0
|
|
# space and data need to be attached to perform search.
|
|
# space and data need to be attached to perform search.
|
|
self._space_attached = False
|
|
self._space_attached = False
|
|
self._data_attached = False
|
|
self._data_attached = False
|
|
self._cross_validator_attached = False
|
|
self._cross_validator_attached = False
|
|
|
|
+ # _best_score is the same as best_trial_score property
|
|
|
|
+ # but is defined in order not to go through all the trials
|
|
|
|
+ # at each iteration.
|
|
|
|
+ self._best_score = np.nan
|
|
|
|
|
|
# if a trials object already exists at the given path,
|
|
# if a trials object already exists at the given path,
|
|
# it is loaded and the search is continued. Else,
|
|
# it is loaded and the search is continued. Else,
|
|
@@ -102,20 +120,23 @@ class PipelineSelector(ABC):
|
|
with open(trials_path, "rb") as f:
|
|
with open(trials_path, "rb") as f:
|
|
self._trials = pickle.load(f)
|
|
self._trials = pickle.load(f)
|
|
|
|
|
|
|
|
+ self._best_score = self.best_trial_score
|
|
|
|
+
|
|
self._logger.info(("Loaded an existing trials object"
|
|
self._logger.info(("Loaded an existing trials object"
|
|
"Consisting of {} trials")
|
|
"Consisting of {} trials")
|
|
.format(len(self._trials.trials)))
|
|
.format(len(self._trials.trials)))
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
- self._logger.error(("Trials object could not be loaded. "
|
|
|
|
- "Training starts from the beginning. "
|
|
|
|
- "Exit with error {}").format(e))
|
|
|
|
|
|
+ err = ("Trials object could not be loaded. "
|
|
|
|
+ "Exit with error {}").format(e)
|
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
+ self._trials = None
|
|
|
|
|
|
else:
|
|
else:
|
|
- self._logger.info(("No existing trials object was found"
|
|
|
|
- "Initialized an empty trials object."))
|
|
|
|
|
|
+ self._logger.warning(("No existing trials object was found, "
|
|
|
|
+ "Starting from scratch."))
|
|
|
|
|
|
- self._best_score = self.best_trial_score
|
|
|
|
|
|
+ self._trials = None
|
|
|
|
|
|
def _backup_trials(self):
|
|
def _backup_trials(self):
|
|
'''
|
|
'''
|
|
@@ -140,9 +161,13 @@ class PipelineSelector(ABC):
|
|
a python module. Optional when the cross_validator
|
|
a python module. Optional when the cross_validator
|
|
is provided directly.
|
|
is provided directly.
|
|
"""
|
|
"""
|
|
- assert((cross_validator is not None) or
|
|
|
|
- ((module_path is not None) and (name is not None))),\
|
|
|
|
- "Either space or (module_path, name) must be provided"
|
|
|
|
|
|
+ try:
|
|
|
|
+ assert((cross_validator is not None) or
|
|
|
|
+ ((module_path is not None) and (name is not None)))
|
|
|
|
+ except AssertionError:
|
|
|
|
+ err = ("Either cross_validator or "
|
|
|
|
+ "(module_path, name) must be provided")
|
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
|
|
self._cross_validator = cross_validator or\
|
|
self._cross_validator = cross_validator or\
|
|
LoadingUtils().load_from_module(module_path=module_path, name=name)
|
|
LoadingUtils().load_from_module(module_path=module_path, name=name)
|
|
@@ -169,9 +194,12 @@ class PipelineSelector(ABC):
|
|
a python module. Optional when the space
|
|
a python module. Optional when the space
|
|
is provided directly.
|
|
is provided directly.
|
|
'''
|
|
'''
|
|
- assert((space is not None) or
|
|
|
|
- ((module_path is not None) and (name is not None))),\
|
|
|
|
- "Either space or (module_path, name) must be provided"
|
|
|
|
|
|
+ try:
|
|
|
|
+ assert((space is not None) or
|
|
|
|
+ ((module_path is not None) and (name is not None)))
|
|
|
|
+ except AssertionError:
|
|
|
|
+ err = "Either space or (module_path, name) must be provided"
|
|
|
|
+ self._logger.log_and_raise_error(err)
|
|
|
|
|
|
self._space = space or LoadingUtils().load_from_module(
|
|
self._space = space or LoadingUtils().load_from_module(
|
|
module_path=module_path, name=name)
|
|
module_path=module_path, name=name)
|
|
@@ -214,7 +242,8 @@ class PipelineSelector(ABC):
|
|
isinstance(y_train, (pd.Series, np.array,
|
|
isinstance(y_train, (pd.Series, np.array,
|
|
pd.DataFrame, NoneType)) and
|
|
pd.DataFrame, NoneType)) and
|
|
isinstance(y_val, (pd.Series, np.array)) and
|
|
isinstance(y_val, (pd.Series, np.array)) and
|
|
- (type(y_train) == type(y_val)))
|
|
|
|
|
|
+ ((y_val is None) if (y_train is None)
|
|
|
|
+ else (y_val is not None)))
|
|
except AssertionError:
|
|
except AssertionError:
|
|
self._logger.log_and_raise_error(input_err)
|
|
self._logger.log_and_raise_error(input_err)
|
|
|
|
|
|
@@ -276,7 +305,7 @@ class PipelineSelector(ABC):
|
|
scoring=make_scorer(self._cost_func),
|
|
scoring=make_scorer(self._cost_func),
|
|
error_score=np.nan)
|
|
error_score=np.nan)
|
|
|
|
|
|
- return {'value': self._averaging_func(scores['test_score']),
|
|
|
|
|
|
+ return {'value': self._cross_val_averaging_func(scores['test_score']),
|
|
'variance': np.var(scores['test_score'])}
|
|
'variance': np.var(scores['test_score'])}
|
|
|
|
|
|
def _objective(self, space_element: dict) -> dict:
|
|
def _objective(self, space_element: dict) -> dict:
|
|
@@ -388,26 +417,27 @@ class PipelineSelector(ABC):
|
|
"""
|
|
"""
|
|
"""
|
|
"""
|
|
pass
|
|
pass
|
|
- self._trials = self._trials or []
|
|
|
|
-
|
|
|
|
- finished_combinations = [trial["combination"]
|
|
|
|
- for trial in self._trials]
|
|
|
|
|
|
|
|
- for space_element in self._space:
|
|
|
|
- combination = [(trial["name"],
|
|
|
|
- [(k, trial["params"][k])
|
|
|
|
- for k in trial["params"]])
|
|
|
|
- for trial in self._trials]
|
|
|
|
-
|
|
|
|
- if combination not in finished_combinations:
|
|
|
|
-
|
|
|
|
- result = self._objective(space_element)
|
|
|
|
-
|
|
|
|
- pipeline = space_element["pipeline"].set_params(
|
|
|
|
- space_element["params"])
|
|
|
|
|
|
+ @abstractproperty
|
|
|
|
+ def best_trial(self) -> float:
|
|
|
|
+ """
|
|
|
|
+ """
|
|
|
|
+ pass
|
|
|
|
|
|
- self._trials.append({"combination": combination,
|
|
|
|
- "pipeline": pipeline,
|
|
|
|
- "result": result})
|
|
|
|
|
|
+ @abstractproperty
|
|
|
|
+ def best_trial_score(self) -> float:
|
|
|
|
+ """
|
|
|
|
+ """
|
|
|
|
+ pass
|
|
|
|
|
|
|
|
+ @abstractproperty
|
|
|
|
+ def best_trial_score_variance(self) -> float:
|
|
|
|
+ """
|
|
|
|
+ """
|
|
|
|
+ pass
|
|
|
|
|
|
|
|
+ @abstractproperty
|
|
|
|
+ def best_trial_pipeline(self) -> Pipeline:
|
|
|
|
+ """
|
|
|
|
+ """
|
|
|
|
+ pass
|