#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 14:23:23 2020

@author: tanya

@description: an abstract class for selecting a machine learning
pipeline from a space (deterministic or random) of parameter distributions
over multiple pipelines.
The selection is designed in such a way that a Trials object is
maintained during the tuning process, from which one can retrieve
the best pipeline so far as well as the entire tuning history if needed.
Methods configure_cross_validation and configure_result_saving
allow using a custom cross-validation method and
saving the current best result to a file or database during training.
Children classes: hyperopt and custom gridsearch.
"""

import pickle
import os
import sys
import time
import datetime
import numpy as np
import pandas as pd
from copy import deepcopy
from abc import ABC, abstractmethod, abstractproperty

if (sys.version_info.major == 3) and (sys.version_info.minor >= 8):
    print("I have python version {}.{} and will import typing"
          .format(sys.version_info.major, sys.version_info.minor))
    from typing import Callable, TypedDict,\
        Literal, Dict, Iterable, List, Tuple, Union
else:
    # from typing_extensions import *
    print("I have python version {}.{} and will import typing_extensions"
          .format(sys.version_info.major, sys.version_info.minor))
    from typing_extensions import Callable, TypedDict,\
        Literal, Dict, Iterable, List, Tuple, Union

import functools
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate as sklearn_cross_validation
from sklearn.metrics import make_scorer
from hyperopt import STATUS_OK, STATUS_FAIL
from cdplib.log import Log
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
from cdplib.utils import LoadingUtils
from cdplib.ml_validation import CVComposer

sys.path.append(os.getcwd())


class SpaceElementType(TypedDict):
    name: str
    pipeline: Pipeline
    params: dict
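

# Illustrative sketch (assumption, not part of the original module): a single
# element of the search space is a dict matching SpaceElementType. The
# pipeline steps and parameter values below are hypothetical placeholders.
#
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
#
# example_space_element: SpaceElementType = {
#     "name": "scaler_logistic_regression",
#     "pipeline": Pipeline([("scaler", StandardScaler()),
#                           ("clf", LogisticRegression())]),
#     "params": {"clf__C": 1.0, "clf__penalty": "l2"}}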


# TODO Tanya: add possibility to include confusion matrix in
# additional metrics
# check that cv object contains indices


class PipelineSelector(ABC):
    """
    An abstract class for selecting a machine learning
    pipeline from a space (deterministic or random) of parameter
    distributions over multiple pipelines.
    The selection is designed in such a way that a Trials object is
    maintained during the tuning process, from which one can retrieve
    the best pipeline so far as well as the entire tuning history
    if needed.
    Methods configure_cross_validation and configure_result_saving
    allow using a custom cross-validation method and
    saving the current best result to a file or database during training.
    Children classes: hyperopt and custom gridsearch.
    """

    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = None,
                 cross_validation_needs_scorer: bool = True,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Dict[str, Callable] = None,
                 additional_averaging_funcs: Dict[str, Callable] = None,
                 strategy_name: str = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space

        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.

        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and pipelines and parameters tried out so far. If a trials object
            already exists at the given path, it is loaded and the
            search is continued, else the search is started from scratch.

        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            If None, the trials object is backed up every time
            the score improves.

        :param Callable cross_val_averaging_func: function to aggregate
            the cross-validation scores of the cost_func.
            Example different from the mean: mean - c*var.

        :param additional_metrics: dict of additional metrics to keep track of
            in the trials, of the form {"metric_name": metric}.

        :param additional_averaging_funcs: functions used to aggregate
            the output of the cross_validate function,
            if different from cross_val_averaging_func.
            The output always contains the scores of the cost_func and of the
            additional_metrics (if not empty),
            but it can also contain additional information
            (like a probability threshold for example).
            Of the form {"metric_name": averaging_func}

        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            the cv object, and the cost function.
            When the strategy changes, one must start with new trials.

        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        self._logger = Log("PipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        try:
            ExceptionsHandler(self._logger)\
                .assert_is_directory(path=trials_path)

            self.attached_space = False
            self.attached_data = False
            self.configured_cross_validation = False
            self.configured_summary_saving = False

            self._cost_func = cost_func
            self._greater_is_better = greater_is_better
            # score factor is 1 when cost_func is minimized,
            # -1 when cost_func is maximized
            self._score_factor = (not greater_is_better) - greater_is_better
            self._cross_val_averaging_func = cross_val_averaging_func
            self._additional_metrics = additional_metrics or {}
            self._additional_averaging_funcs = additional_averaging_funcs or {}
            self.trials_path = trials_path
            self._backup_trials_freq = backup_trials_freq
            self._strategy_name = strategy_name
            self._data_path = None
            self._cv_path = None

            self._X = None
            self._y = None
            self._cv = None
            self._space = None

            # if cross-validation is not configured,
            # the sklearn cross-validation method is taken by default
            self._cross_validation = sklearn_cross_validation
            self._cross_validation_needs_scorer = cross_validation_needs_scorer

            # if a trials object already exists at the given path,
            # it is loaded and the search is continued. Else,
            # the search is started from the beginning.
            if os.path.isfile(self.trials_path):
                with open(self.trials_path, "rb") as f:
                    self._trials = pickle.load(f)
                if len(self._trials) == 0:
                    self._trials = None
            else:
                self._trials = None

            if self._trials is not None:
                self._start_iteration = self.number_of_trials
                self.best_score = self.best_trial_score
                self._logger.info(("Loaded an existing trials object "
                                   "consisting of {} trials")
                                  .format(self._start_iteration))
            else:
                self._logger.warning(("No existing trials object was found, "
                                      "starting from scratch."))
                self._trials = None
                self._start_iteration = 0
                self.best_score = np.nan

            # keeping track of the current search iteration
            self._iteration = self._start_iteration
            self._score_improved = False

            self.start_tuning_time = datetime.datetime.today()
            self.total_tuning_time = None
            self.finished_tuning = False

        except Exception as e:
            err = ("Failed to initialize the class. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _backup_trials(self) -> None:
        '''
        Pickles (saves) the trials object in binary format.
        '''
        try:
            with open(self.trials_path, "wb") as f:
                pickle.dump(self._trials, f)
        except Exception as e:
            err = "Could not backup trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def configure_cross_validation(self,
                                   cross_validation: Callable,
                                   kwargs: dict = None) -> None:
        """
        Method for attaching a custom cross-validation function

        :param cross_validation: a function that has the same
            signature as sklearn.model_selection.cross_validate
        """
        try:
            kwargs = kwargs or {}

            self._cross_validation = functools.partial(
                cross_validation, **kwargs)

            self.configured_cross_validation = True
            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to configure cross-validation. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
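
    # Illustrative sketch (assumption, not part of the original module):
    # a custom function passed to configure_cross_validation should mirror
    # the signature of sklearn.model_selection.cross_validate and return a
    # dict of per-fold arrays under keys starting with "test_". "selector"
    # and "manual_cross_validate" are hypothetical names; clone is
    # sklearn.base.clone and X, y are assumed to be numpy arrays.
    #
    # def manual_cross_validate(estimator, X, y=None, cv=None, scoring=None):
    #     scores = {"test_score": []}
    #     for train_inds, val_inds in cv:
    #         est = clone(estimator).fit(
    #             X[train_inds], None if y is None else y[train_inds])
    #         scores["test_score"].append(
    #             scoring["score"](est, X[val_inds],
    #                              None if y is None else y[val_inds]))
    #     return {name: np.array(vals) for name, vals in scores.items()}
    #
    # selector.configure_cross_validation(manual_cross_validate)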

    def configure_cross_validation_from_module(self,
                                               module_path: str,
                                               name: str) -> None:
        """
        Attaches a cross-validation function defined in
        a different python module. This function must have
        the same signature as sklearn.model_selection.cross_validate

        :param str module_path: path to the python module
            where the cross_validation function is defined.

        :param str name: name of the cross-validation function
            loaded from the python module.
        """
        try:
            self._cross_validation = \
                LoadingUtils().load_from_module(
                    module_path=module_path, name=name)

            self.configured_cross_validation = True
            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to load cross-validation from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space(self, space) -> None:
        """
        Method for attaching the pipeline/hyperparameter space
        over which the score_func is optimized.

        :param space: space where
            the search is performed. A space might be either
            a list of dictionaries or a hyperopt space object
            the elements of which are dictionaries with keys:
            name, pipeline, params
        """
        try:
            self._space = space
            self.attached_space = True
            self._logger.info("Attached parameter distribution space")
        except Exception as e:
            err = ("Failed to attach space. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space_from_module(self, module_path: str, name: str) -> None:
        """
        Attaches a space defined in a different python module.

        :param str module_path: path to the python module
            where the space is defined.

        :param str name: name of the space loaded from
            the python module.
        """
        try:
            self._space = LoadingUtils().load_from_module(
                module_path=module_path, name=name)

            self.attached_space = True
            self._logger.info("Attached parameter distribution space")
        except Exception as e:
            err = ("Failed to attach space from module. "
                   "Exit with error {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
                    y_train: Union[pd.DataFrame, pd.Series, np.ndarray]
                    = None,
                    X_val: Union[pd.DataFrame, np.ndarray]
                    = None,
                    y_val: Union[pd.DataFrame, pd.Series, np.ndarray]
                    = None,
                    cv: Union[Iterable[Tuple[List[int], List[int]]], int]
                    = None) -> None:
        '''
        :param array X_train: data on which
            machine learning pipelines are trained

        :param array y_train: optional, vector with targets
            (None in case of unsupervised learning)

        :param array X_val: optional, validation data.
            When not provided, the cross-validated value
            of the cost_func is calculated.

        :param array y_val: optional, validation targets

        :param list cv: iterable of tuples containing
            train and validation indices, or an integer representing
            the number of folds for a random split of the data
            during cross-validation.
            Example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
        '''
        try:
            assert((cv is None) == (X_val is not None)),\
                "Either cv or X_val must be provided"

            if cv is None:
                assert((y_val is None) == (y_train is None)),\
                    "y_train and y_val must be provided together"

                # Here we create a trivial cv object
                # with one validation split.
                # XXX Tanya finish here
                cv = CVComposer.dummy_cv()

                train_inds = list(range(len(X_train)))
                val_inds = list(range(len(X_train),
                                      len(X_train) + len(X_val)))

                self._cv = [(train_inds, val_inds)]

                self._X = np.concatenate([X_train, X_val])
                self._y = None if y_train is None\
                    else np.concatenate([y_train, y_val])
            else:
                self._cv = cv
                self._X = X_train
                self._y = y_train

            self.attached_data = True
            self._logger.info("Attached data")
        except Exception as e:
            err = ("Failed to attach data. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
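
    # Illustrative usage sketch (assumption, not part of the original module):
    # either pass an explicit cv object together with the full training data,
    # or pass a separate validation set and let attach_data build a single
    # train/validation split internally. "selector" is a hypothetical
    # instance of a concrete child class.
    #
    # selector.attach_data(X_train=X, y_train=y,
    #                      cv=[([0, 1, 2], [3, 4]), ([1, 2, 3], [4, 5])])
    #
    # selector.attach_data(X_train=X_tr, y_train=y_tr,
    #                      X_val=X_va, y_val=y_va)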

    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None) -> None:
        """
        Method for attaching data from an hdf5 store
        and a cv object from a pickled file.

        The hdf5 store is a binary file;
        after loading it, it is a dictionary with keys
        X_train (y_train, X_val, y_val).
        The cv is loaded from a pickle file.

        The reason to separate the data store
        from the cv store is that hdf5 is optimized for
        storing large dataframes (especially with simple types), while
        a small list of lists like a cv object is better
        stored as a pickle file.

        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data

        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path)),\
                "Parameter hdf5_store_path is not a file"

            # close all opened files, because hdf5 will
            # fail to reopen an opened (for some reason) file
            import tables
            tables.file._open_files.close_all()

            store = pd.HDFStore(data_hdf5_store_path)
            self._data_path = data_hdf5_store_path

            data_input = {key: store[key] if key in store else None
                          for key in ["X_train", "y_train", "X_val", "y_val"]}

            if cv_pickle_path is not None:
                assert(os.path.isfile(cv_pickle_path)),\
                    "Parameter cv_pickle_path is not a file"

                data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
                self._cv_path = cv_pickle_path
            else:
                data_input["cv"] = None

            self.attach_data(**data_input)
            store.close()
        except Exception as e:
            err = "Failed to attach data. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)
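
    # Illustrative sketch (assumption, not part of the original module) of how
    # the hdf5 store and the cv pickle consumed by attach_data_from_hdf5
    # might be produced; the paths and dataframes are placeholders.
    #
    # with pd.HDFStore("data_store.h5") as store:
    #     store["X_train"] = X_train_df
    #     store["y_train"] = y_train_series
    #
    # with open("cv.pickle", "wb") as f:
    #     pickle.dump([([0, 1, 2], [3, 4]), ([1, 2, 3], [4, 5])], f)
    #
    # selector.attach_data_from_hdf5(data_hdf5_store_path="data_store.h5",
    #                                cv_pickle_path="cv.pickle")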

    @property
    def default_summary(self) -> dict:
        """
        Default summary of the strategy.
        Every time the _objective function is called,
        the current score and the information
        about the tested space element are added to the
        summary and it is saved to the Trials.
        If summary saving is configured, it is also
        saved to a file or a database when the score improves.
        """
        summary = {}

        if self._strategy_name is not None:
            summary["strategy_name"] = self._strategy_name

        if isinstance(self._cost_func, str):
            summary["cost_func"] = self._cost_func
        elif hasattr(self._cost_func, "__name__"):
            summary["cost_func"] = self._cost_func.__name__

        summary["trials_path"] = self.trials_path

        if self._data_path is not None:
            summary["data_path"] = self._data_path

        if self._cv_path is not None:
            summary["cv_path"] = self._cv_path

        summary["start_tuning_time"] = self.start_tuning_time

        summary["iteration"] = self._iteration

        return summary

    def configer_summary_saving(self,
                                save_method: Callable
                                = functools.partial(
                                    pd.DataFrame.to_excel,
                                    **{"excel_writer": "result.xlsx"}),
                                kwargs: dict = None) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and can be saved to a file or database, depending
        on the configured save_method.

        :param Callable save_method: method for saving the result
            of the pipeline selection. The method must accept
            a pandas DataFrame as argument.
            By default, saving to an excel file.

            Examples:

            functools.partial(pd.DataFrame.to_csv,
                              **{"path_or_buf": <PATH>})

            functools.partial(np.savetxt, **{"fname": <PATH>})

            functools.partial(SQLHandler(<URI>).append_to_table,
                              **{"tablename": <NAME>})

            functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
                              **{"collection_name": <NAME>})

            Using functools can be avoided by providing the kwargs argument.

        :param dict kwargs: a dictionary with keyword arguments
            (like tablename) to provide to the save_method
        """
        try:
            kwargs = kwargs or {}
            self._save_method = functools.partial(save_method, **kwargs)
            self.configured_summary_saving = True
            self._logger.info("Configured summary saving")
        except Exception as e:
            err = ("Failed to configure the summary saving. "
                   "Exit with error {}".format(e))
            self._logger.log_and_raise_error(err)
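
    # Illustrative usage sketch (assumption, not part of the original module):
    # save the summary of each improving trial to a csv file, passing the
    # keyword arguments separately instead of using functools.partial;
    # "selector" and the file name are placeholders.
    #
    # selector.configer_summary_saving(save_method=pd.DataFrame.to_csv,
    #                                  kwargs={"path_or_buf": "summary.csv"})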

    def _save_summary(self, summary: dict) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and is saved to a file or database with the configured save_method.
        """
        try:
            assert(self.configured_summary_saving),\
                "Result saving must be configured first"

            self._save_method(summary)
        except Exception as e:
            err = ("Could not save the summary. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
        """
        Calculates the averaged cross-validated score and score variance,
        as well as the averaged values and variances of the additional
        metrics.

        This method is called in the _objective function that is
        passed to the hyperopt optimizer.

        This method can be overridden when the cost
        needs to be calculated differently,
        for example with a tensorflow model.

        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation

        :return: dictionary with the aggregated
            cross-validation scores and
            the score variances for the scores in the output
            of the cross-validation function.

            Form of the output:

                {"score": 10, # score used in optimization
                 "score_variance": 0.5,
                 "additional_metric1": 5,
                 "additional_metric1_variance": 7}

            A custom cross-validation function can also include, for
            example, a probability threshold for each fold; then
            the output of this method will include the average
            value and the variance of the probability threshold
            over the folds.
        """
        try:
            scoring = {"score": self._cost_func, **self._additional_metrics}

            if self._cross_validation_needs_scorer:
                for metric_name, metric in scoring.items():
                    scoring[metric_name] = make_scorer(
                        metric, greater_is_better=self._greater_is_better)

            cross_validation_input_args = {
                "estimator": pipeline,
                "X": self._X,
                "y": self._y,
                "cv": self._cv,
                "scoring": scoring
            }

            # only pass error_score if the configured cross-validation
            # function declares it (custom functions may not accept it)
            if "error_score" in getattr(self._cross_validation,
                                        "__annotations__", {}):
                cross_validation_input_args["error_score"] = np.nan

            scores = self._cross_validation(**cross_validation_input_args)

            averaging_funcs = {
                metric_name: self._additional_averaging_funcs[metric_name]
                if metric_name in self._additional_averaging_funcs
                else self._cross_val_averaging_func
                for metric_name in scores}

            scores_average = {
                metric_name.replace("test_", ""):
                averaging_funcs[metric_name](scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            scores_variance = {
                metric_name.replace("test_", "") + "_variance":
                np.var(scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            return {**scores_average, **scores_variance}
        except Exception as e:
            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def _objective(self, space_element: SpaceElementType) -> dict:
        '''
        This method is called in the run_trials method
        that is using the hyperopt fmin optimizer.
        Uses the _evaluate method.

        It must take as input a space element
        and produce an output in the form of a dictionary
        with 2 obligatory values: loss and status
        (STATUS_OK or STATUS_FAIL). Other
        values in the output are optional and can be
        accessed later through the trials object.

        :Warning: fmin minimizes the loss;
            when _evaluate returns a value to be maximized,
            it is multiplied by -1 to obtain the loss.

        :param SpaceElementType space_element: element
            of the space over which the optimization is done

        :output: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            understood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time: execution time,
            and other keys given in self.default_summary
        '''
        try:
            start_time = time.time()

            assert(self.attached_data),\
                ("Data must be attached in order "
                 "to search for the best pipeline")

            summary = deepcopy(self.default_summary)

            # backup the current trials if the score improved
            # at the previous iteration or every ith iteration
            # if the backup_trials_freq is set
            backup_cond = ((self._backup_trials_freq is not None) and
                           ((self._iteration - self._start_iteration - 1) %
                            self._backup_trials_freq == 0)) or\
                self._score_improved

            if backup_cond:
                self._backup_trials()
                self._score_improved = False

            pipeline = space_element['pipeline']
            params = space_element['params']
            pipeline.set_params(**params)

            self._logger.info(("Iteration {0}: "
                               "Current score is {1}: "
                               "Training pipeline {2} "
                               "with parameters: {3}. ").format(
                                   self._iteration,
                                   self.best_score,
                                   space_element['name'],
                                   params))

            result = self._evaluate(pipeline)

            summary.update(result)

            end_time = time.time()

            summary['status'] = STATUS_OK
            summary['loss'] = self._score_factor * summary['score']
            summary['timestamp'] = datetime.datetime.today()
            summary['train_time'] = end_time - start_time

            self._iteration += 1

            # best_score != best_score is True only if best_score is NaN,
            # i.e. no best score has been recorded yet
            self._score_improved = (self.best_score != self.best_score) or\
                                   (self._score_factor*result["score"] <
                                    self._score_factor*self.best_score)

            if self._score_improved:
                self._logger.info("Score improved, new best score is: {}"
                                  .format(result["score"]))

                self.best_score = result['score']

                if self.configured_summary_saving:
                    self._save_summary(summary)

        except Exception as e:
            self._logger.warning("Trial failed with error {}".format(e))

            summary = {}
            summary['status'] = STATUS_FAIL
            summary['timestamp'] = datetime.datetime.today()
            summary['error'] = e

            for key in ['loss', 'score', 'score_variance', 'train_time']:
                summary[key] = np.nan

        return summary

    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self.space.
        When the run_trials method is finished, the flag self.finished_tuning
        should be set to True, and the methods self._backup_trials and
        optionally self._save_result should be called.
        """
        pass
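
    # Illustrative sketch (assumption, not part of the original module) of how
    # a hyperopt-based child class might implement run_trials with fmin and a
    # Trials object; only _objective, _space, _trials, _start_iteration,
    # finished_tuning and _backup_trials come from this class, the rest is
    # hypothetical.
    #
    # from hyperopt import fmin, tpe, Trials
    #
    # def run_trials(self, niter: int):
    #     self._trials = self._trials or Trials()
    #     fmin(fn=self._objective,
    #          space=self._space,
    #          algo=tpe.suggest,
    #          max_evals=self._start_iteration + niter,
    #          trials=self._trials)
    #     self.finished_tuning = True
    #     self._backup_trials()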

    @abstractproperty
    def number_of_trials(self) -> int:
        """
        Number of trials already run in the current trials object
        """
        pass

    @abstractproperty
    def best_trial(self) -> dict:
        """
        Best trial so far.
        Should contain the status, pipeline,
        hyperparameters, and the score (loss).
        Other information is optional and is defined
        by self.default_summary
        """
        pass

    @abstractproperty
    def best_trial_score(self) -> float:
        """
        Score of the best pipeline with the best hyperparameters
        """
        pass

    @abstractproperty
    def best_trial_score_variance(self) -> float:
        """
        Variance of the cross-validation score of the best pipeline
        """
        pass

    @abstractproperty
    def best_trial_pipeline(self) -> Pipeline:
        """
        Best pipeline with best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with corresponding
        best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline types
        with corresponding hyperparameters
        """
        pass

    @abstractmethod
    def trials_to_excel(self, path: str) -> None:
        """
        Trials object in the shape of a table written to excel;
        should contain the iteration, pipeline (as str),
        hyperparameters (as str), self.best_result (see self._objective
        method), as well as additional information defined by
        self.default_summary
        """
        pass
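

# Illustrative end-to-end sketch (assumption, not part of the original
# module): a concrete child class of PipelineSelector would typically be used
# roughly as follows; the class name, cost function and variables are
# placeholders.
#
# selector = SomeConcretePipelineSelector(cost_func=accuracy_score,
#                                         greater_is_better=True,
#                                         trials_path="trials/trials.pickle")
# selector.attach_space(space)
# selector.attach_data(X_train=X, y_train=y, cv=5)
# selector.configer_summary_saving()
# selector.run_trials()
# best_pipeline = selector.best_trial_pipeline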