#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 14:23:23 2020

@author: tanya

@description: an abstract class for selecting a machine learning
pipeline from a space (deterministic or random) of parameter distributions
over multiple pipelines.
The selection is designed in such a way that a Trials object is
maintained during the tuning process, from which one can retrieve
the best pipeline so far as well as the entire tuning history if needed.
Methods configure_cross_validation and configure_summary_saving
allow using a custom cross-validation method and
saving the current best result to a file or database during training.
Children classes: hyperopt and custom gridsearch.
"""

import pickle
import os
import sys
import time
import datetime
import numpy as np
import pandas as pd
from copy import deepcopy
from abc import ABC, abstractmethod, abstractproperty

if (sys.version_info.major == 3) and (sys.version_info.minor >= 8):
    print("I have python version {}.{} and will import typing"
          .format(sys.version_info.major, sys.version_info.minor))
    from typing import (Callable, TypedDict, Literal,
                        Dict, Iterable, List, Tuple, Union)
else:
    print("I have python version {}.{} and will import typing_extensions"
          .format(sys.version_info.major, sys.version_info.minor))
    # Literal and TypedDict were only added to typing in python 3.8
    from typing import Callable, Dict, Iterable, List, Tuple, Union
    from typing_extensions import Literal, TypedDict

import functools
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate as sklearn_cross_validation
from sklearn.metrics import make_scorer
from hyperopt import STATUS_OK, STATUS_FAIL
from cdplib.log import Log
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
from cdplib.utils import LoadingUtils
from cdplib.ml_validation import CVComposer

sys.path.append(os.getcwd())


class SpaceElementType(TypedDict):
    name: str
    pipeline: Pipeline
    params: dict


# TODO Tanya: add possibility to include confusion matrix in
# additional metrics
# check that cv object contains indices


class PipelineSelector(ABC):
    """
    An abstract class for selecting a machine learning
    pipeline from a space (deterministic or random) of parameter
    distributions over multiple pipelines.
    The selection is designed in such a way that a Trials object is
    maintained during the tuning process, from which one can retrieve
    the best pipeline so far as well as the entire tuning history
    if needed.
    Methods configure_cross_validation and configure_summary_saving
    allow using a custom cross-validation method and
    saving the current best result to a file or database during training.
    Children classes: hyperopt and custom gridsearch.
    """
    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = None,
                 cross_validation_needs_scorer: bool = True,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Dict[str, Callable] = None,
                 additional_averaging_funcs: Dict[str, Callable] = None,
                 strategy_name: str = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space
        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.
        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and the pipelines and parameters tried out so far. If a trials
            object already exists at the given path, it is loaded and the
            search is continued, else the search is started from scratch.
        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            If None, the trials object is backed up every time
            the score improves.
        :param Callable cross_val_averaging_func: function to aggregate
            the cross-validation scores of the cost_func.
            Example different from the mean: mean - c*var.
        :param additional_metrics: dict of additional metrics to keep track
            of in the trials, of the form {"metric_name": metric}.
        :param additional_averaging_funcs: functions used to aggregate
            the output of the cross_validate function, if different
            from cross_val_averaging_func.
            The output always contains the scores of the cost_func and of
            the additional_metrics (if not empty), but it can also contain
            additional information (like a probability threshold for example).
            Of the form {"metric_name": averaging_func}
        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            cv object, cost function.
            When the strategy changes, one must start with new trials.
        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        self._logger = Log("PipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        try:
            ExceptionsHandler(self._logger)\
                .assert_is_directory(path=trials_path)

            self.attached_space = False
            self.attached_data = False
            self.configured_cross_validation = False
            self.configured_summary_saving = False

            self._cost_func = cost_func
            self._greater_is_better = greater_is_better
            # score factor is 1 when cost_func is minimized,
            # -1 when cost func is maximized
            self._score_factor = (not greater_is_better) - greater_is_better
            self._cross_val_averaging_func = cross_val_averaging_func
            self._additional_metrics = additional_metrics or {}
            self._additional_averaging_funcs = additional_averaging_funcs or {}
            self.trials_path = trials_path
            self._backup_trials_freq = backup_trials_freq
            self._strategy_name = strategy_name
            self._data_path = None
            self._cv_path = None

            self._X = None
            self._y = None
            self._cv = None
            self._space = None

            # if cross-validation is not configured,
            # the sklearn cross-validation method is taken by default
            self._cross_validation = sklearn_cross_validation
            self._cross_validation_needs_scorer = cross_validation_needs_scorer

            # if a trials object already exists at the given path,
            # it is loaded and the search is continued. Else,
            # the search is started from the beginning.
            if os.path.isfile(self.trials_path):
                with open(self.trials_path, "rb") as f:
                    self._trials = pickle.load(f)
                if len(self._trials) == 0:
                    self._trials = None
            else:
                self._trials = None

            if self._trials is not None:
                self._start_iteration = self.number_of_trials
                self.best_score = self.best_trial_score
                self._logger.info(("Loaded an existing trials object "
                                   "consisting of {} trials")
                                  .format(self._start_iteration))
            else:
                self._logger.warning(("No existing trials object was found, "
                                      "starting from scratch."))
                self._trials = None
                self._start_iteration = 0
                self.best_score = np.nan

            # keeping track of the current search iteration
            self._iteration = self._start_iteration
            self._score_improved = False

            self.start_tuning_time = datetime.datetime.today()
            self.total_tuning_time = None
            self.finished_tuning = False
        except Exception as e:
            err = ("Failed to initialize the class. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _backup_trials(self) -> None:
        '''
        Pickles (saves) the trials object in binary format.
        '''
        try:
            with open(self.trials_path, "wb") as f:
                pickle.dump(self._trials, f)
        except Exception as e:
            err = "Could not backup trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def configure_cross_validation(self,
                                   cross_validation: Callable,
                                   kwargs: dict = None) -> None:
        """
        Method for attaching a custom cross-validation function
        :param cross_validation: a function that has the same
            signature as sklearn.model_selection.cross_validate
        :param dict kwargs: keyword arguments fixed in the
            cross_validation function via functools.partial
        """
        try:
            kwargs = kwargs or {}
            self._cross_validation = functools.partial(
                cross_validation, **kwargs)
            self.configured_cross_validation = True
            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to configure cross-validation. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
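
    # Example (hedged): a minimal sketch of attaching a custom
    # cross-validation function. The function and the "selector" instance
    # below are hypothetical; the sketch only illustrates that the callable
    # must accept the same keyword arguments as
    # sklearn.model_selection.cross_validate and return a dict of arrays
    # keyed "test_<metric_name>".
    #
    #     def my_cross_validate(estimator, X, y=None, cv=None,
    #                           scoring=None, error_score=np.nan) -> dict:
    #         """Assumed drop-in replacement for sklearn's cross_validate."""
    #         from sklearn.model_selection import cross_validate
    #         return cross_validate(estimator, X, y=y, cv=cv,
    #                               scoring=scoring, error_score=error_score)
    #
    #     selector.configure_cross_validation(my_cross_validate)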

    def configure_cross_validation_from_module(self,
                                               module_path: str,
                                               name: str) -> None:
        """
        Attaches a cross-validation function defined in
        a different python module. This function must have
        the same signature as sklearn.model_selection.cross_validate
        :param str module_path: path to the python module
            where the cross_validation function is defined.
        :param str name: name of the cross validation function
            loaded from the python module.
        """
        try:
            self._cross_validation = \
                LoadingUtils().load_from_module(
                    module_path=module_path, name=name)
            self.configured_cross_validation = True
            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to load cross-validation from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space(self, space) -> None:
        """
        Method for attaching the pipeline/hyperparameter space
        over which the cost_func is optimized.
        :param space: space where
            the search is performed. A space might be either
            a list of dictionaries or a hyperopt space object
            the elements of which are dictionaries with keys:
            name, pipeline, params
        """
        try:
            self._space = space
            self.attached_space = True
            self._logger.info("Attached parameter distribution space")
        except Exception as e:
            err = ("Failed to attach space. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
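
    # Example (hedged): a minimal sketch of a deterministic space, following
    # the SpaceElementType keys (name, pipeline, params). The pipeline steps,
    # parameter values, and the "selector" instance are illustrative only.
    #
    #     from sklearn.pipeline import Pipeline
    #     from sklearn.preprocessing import StandardScaler
    #     from sklearn.linear_model import LogisticRegression
    #
    #     example_space = [
    #         {"name": "logistic_regression",
    #          "pipeline": Pipeline([("scaler", StandardScaler()),
    #                                ("model", LogisticRegression())]),
    #          "params": {"model__C": 1.0}},
    #         {"name": "logistic_regression_strong_reg",
    #          "pipeline": Pipeline([("scaler", StandardScaler()),
    #                                ("model", LogisticRegression())]),
    #          "params": {"model__C": 0.1}},
    #     ]
    #
    #     selector.attach_space(example_space)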

    def attach_space_from_module(self, module_path: str, name: str) -> None:
        """
        Attaches a space defined in a different python module.
        :param str module_path: path to the python module
            where the space is defined.
        :param str name: name of the space loaded from
            the python module.
        """
        try:
            self._space = LoadingUtils().load_from_module(
                module_path=module_path, name=name)
            self.attached_space = True
            self._logger.info("Attached parameter distribution space")
        except Exception as e:
            err = ("Failed to attach space from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_data(self,
                    X_train: Union[pd.DataFrame, np.ndarray],
                    y_train: Union[pd.DataFrame, pd.Series, np.ndarray] = None,
                    X_val: Union[pd.DataFrame, np.ndarray] = None,
                    y_val: Union[pd.DataFrame, pd.Series, np.ndarray] = None,
                    cv: Union[Iterable[Tuple[List[int], List[int]]], int] = None
                    ) -> None:
        '''
        :param array X_train: data on which
            machine learning pipelines are trained
        :param array y_train: optional, vector with targets
            (None in case of unsupervised learning)
        :param array X_val: optional, validation data.
            When not provided, the cross-validated value
            of the cost_func is calculated.
        :param array y_val: optional, validation targets
        :param list cv: iterable of tuples containing
            train and validation indices, or an integer representing
            the number of folds for a random split of the data
            during cross-validation.
            Example: [([0, 1, 2], [3, 4]), ([1, 2, 3], [4, 5])]
        '''
        try:
            assert((cv is None) == (X_val is not None)),\
                "Either cv or X_val must be provided"

            if cv is None:
                assert((y_val is None) == (y_train is None)),\
                    "y_train and y_val must be provided simultaneously"

                # Here we create a trivial cv object
                # with one validation split.
                # XXX Tanya finish here
                cv = CVComposer.dummy_cv()

                train_inds = list(range(len(X_train)))
                val_inds = list(range(len(X_train),
                                      len(X_train) + len(X_val)))

                self._cv = [(train_inds, val_inds)]
                self._X = np.concatenate([X_train, X_val])
                self._y = None if y_train is None\
                    else np.concatenate([y_train, y_val])
            else:
                self._cv = cv
                self._X = X_train
                self._y = y_train

            self.attached_data = True
            self._logger.info("Attached data")
        except Exception as e:
            err = ("Failed to attach data. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
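
    # Example (hedged): a minimal sketch of attaching training data with an
    # explicit cv object. The arrays, index tuples, and the "selector"
    # instance are illustrative only.
    #
    #     import numpy as np
    #
    #     X = np.random.rand(6, 3)
    #     y = np.array([0, 1, 0, 1, 0, 1])
    #     cv = [([0, 1, 2, 3], [4, 5]), ([2, 3, 4, 5], [0, 1])]
    #
    #     selector.attach_data(X_train=X, y_train=y, cv=cv)
    #
    # Alternatively, a separate validation set can be passed instead of cv:
    #
    #     selector.attach_data(X_train=X[:4], y_train=y[:4],
    #                          X_val=X[4:], y_val=y[4:])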

    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None) -> None:
        """
        Method for attaching data from an hdf5 store
        and a cv object from a pickled file.
        The hdf5 store is a binary file; after loading,
        it behaves like a dictionary with keys
        X_train (y_train, X_val, y_val).
        The cv is loaded from a pickle file.
        The reason to separate the data store from the cv store
        is that hdf5 is optimized for storing large dataframes
        (especially with simple types), while a small list of lists
        like a cv object is better stored as a pickle file.
        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data
        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path)),\
                "Parameter data_hdf5_store_path is not a file"

            # close all opened files, because hdf5 will
            # fail to reopen an already opened (for some reason) file
            import tables
            tables.file._open_files.close_all()

            store = pd.HDFStore(data_hdf5_store_path)
            self._data_path = data_hdf5_store_path

            data_input = {key: store[key] if key in store else None
                          for key in ["X_train", "y_train", "X_val", "y_val"]}

            if cv_pickle_path is not None:
                assert(os.path.isfile(cv_pickle_path)),\
                    "Parameter cv_pickle_path is not a file"
                with open(cv_pickle_path, "rb") as f:
                    data_input["cv"] = pickle.load(f)
                self._cv_path = cv_pickle_path
            else:
                data_input["cv"] = None

            self.attach_data(**data_input)
            store.close()
        except Exception as e:
            err = "Failed to attach data. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)
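
    # Example (hedged): a minimal sketch of writing data in the layout this
    # method expects. The paths, data, and the "selector" instance are
    # illustrative only.
    #
    #     import pickle
    #     import numpy as np
    #     import pandas as pd
    #
    #     X_train = pd.DataFrame(np.random.rand(6, 3))
    #     y_train = pd.Series([0, 1, 0, 1, 0, 1])
    #     cv = [([0, 1, 2, 3], [4, 5]), ([2, 3, 4, 5], [0, 1])]
    #
    #     with pd.HDFStore("data_store.h5") as store:
    #         store["X_train"] = X_train
    #         store["y_train"] = y_train
    #
    #     with open("cv.pickle", "wb") as f:
    #         pickle.dump(cv, f)
    #
    #     selector.attach_data_from_hdf5("data_store.h5", "cv.pickle")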

    @property
    def default_summary(self) -> dict:
        """
        Default summary of the strategy.
        Every time the _objective function is called,
        the current score and the information
        about the tested space element are added to the
        summary and it is saved to the Trials.
        If summary saving is configured, it is also
        saved to a file or a database when the score improves.
        """
        summary = {}

        if self._strategy_name is not None:
            summary["strategy_name"] = self._strategy_name

        if isinstance(self._cost_func, str):
            summary["cost_func"] = self._cost_func
        elif hasattr(self._cost_func, "__name__"):
            summary["cost_func"] = self._cost_func.__name__

        summary["trials_path"] = self.trials_path

        if self._data_path is not None:
            summary["data_path"] = self._data_path

        if self._cv_path is not None:
            summary["cv_path"] = self._cv_path

        summary["start_tuning_time"] = self.start_tuning_time
        summary["iteration"] = self._iteration

        return summary

    def configure_summary_saving(self,
                                 save_method: Callable
                                 = functools.partial(
                                     pd.DataFrame.to_excel,
                                     **{"excel_writer": "result.xlsx"}),
                                 kwargs: dict = None) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and can be saved to a file or database, depending
        on the configured save_method.
        :param Callable save_method: method for saving the result
            of the pipeline selection. The method must accept
            a pandas DataFrame as argument.
            By default, saving to an excel file.
            Examples:
                functools.partial(pd.DataFrame.to_csv,
                                  **{"path_or_buf": <PATH>})
                functools.partial(np.savetxt, **{"fname": <PATH>})
                functools.partial(SQLHandler(<URI>).append_to_table,
                                  **{"tablename": <NAME>})
                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
                                  **{"collection_name": <NAME>})
            Using functools can be avoided by providing the kwargs argument.
        :param dict kwargs: a dictionary with keyword arguments
            (like tablename) to provide to the save_method
        """
        try:
            kwargs = kwargs or {}
            self._save_method = functools.partial(save_method, **kwargs)
            self.configured_summary_saving = True
            self._logger.info("Configured summary saving")
        except Exception as e:
            err = ("Failed to configure the summary saving. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
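
    # Example (hedged): a minimal sketch of configuring summary saving
    # without functools, by passing the keyword arguments separately.
    # The path and the "selector" instance are illustrative only.
    #
    #     selector.configure_summary_saving(
    #         save_method=pd.DataFrame.to_csv,
    #         kwargs={"path_or_buf": "tuning_results.csv"})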

    def _save_summary(self, summary: dict) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and saved with the configured save_method.
        """
        try:
            assert(self.configured_summary_saving),\
                "Result saving must be configured first"
            self._save_method(summary)
        except Exception as e:
            err = ("Could not save the summary. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
        """
        Calculates the averaged cross-validated score and score variance,
        as well as the averaged values and variances of the additional metrics.
        This method is called in the _objective function that is
        passed to the hyperopt optimizer.
        This function can be overridden when the cost
        needs to be calculated differently,
        for example with a tensorflow model.
        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation
        :return: dictionary with the aggregated
            cross-validation scores and
            the score variances for the scores in the output
            of the cross-validation function.
            Form of the output:
                {"score": 10,  # score used in optimization
                 "score_variance": 0.5,
                 "additional_metric1": 5,
                 "additional_metric1_variance": 7}
            A custom cross-validation function can also include, for
            example, a probability threshold for each fold; then
            the output of this function will include the average
            value and the variance of the probability threshold
            over the folds.
        """
        try:
            scoring = {"score": self._cost_func, **self._additional_metrics}

            if self._cross_validation_needs_scorer:
                for metric_name, metric in scoring.items():
                    scoring[metric_name] = make_scorer(
                        metric, greater_is_better=self._greater_is_better)

            cross_validation_input_args = {
                "estimator": pipeline,
                "X": self._X,
                "y": self._y,
                "cv": self._cv,
                "scoring": scoring
            }

            if "error_score" in getattr(self._cross_validation,
                                        "__annotations__", {}):
                cross_validation_input_args["error_score"] = np.nan

            scores = self._cross_validation(**cross_validation_input_args)

            averaging_funcs = {
                metric_name: self._additional_averaging_funcs[metric_name]
                if metric_name in self._additional_averaging_funcs
                else self._cross_val_averaging_func
                for metric_name in scores}

            scores_average = {
                metric_name.replace("test_", ""):
                averaging_funcs[metric_name](scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            scores_variance = {
                metric_name.replace("test_", "") + "_variance":
                np.var(scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            return {**scores_average, **scores_variance}
        except Exception as e:
            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def _objective(self, space_element: SpaceElementType) -> dict:
        '''
        This method is called in the run_trials method
        that is using the hyperopt fmin optimizer.
        Uses the _evaluate method.
        It must take as input a space element
        and produce an output in the form of a dictionary
        with 2 obligatory values loss and status
        (STATUS_OK or STATUS_FAIL). Other
        values in the output are optional and can be
        accessed later through the trials object.
        :Warning: fmin minimizes the loss,
        when _evaluate returns a value to be maximized,
        it is multiplied by -1 to obtain loss.
        :param SpaceElementType space_element: element
            of the space over which the optimization is done
        :output: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            understood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time: execution time,
            and other keys given in self.default_summary
        '''
        try:
            start_time = time.time()

            assert(self.attached_data),\
                ("Data must be attached in order "
                 "to effectuate the best pipeline search")

            summary = deepcopy(self.default_summary)

            # backup the current trials if the score improved
            # at the previous iteration or every ith iteration
            # if the backup_trials_freq is set
            backup_cond = ((self._backup_trials_freq is not None) and
                           ((self._iteration - self._start_iteration - 1) %
                            self._backup_trials_freq == 0)) or\
                self._score_improved

            if backup_cond:
                self._backup_trials()
                self._score_improved = False

            pipeline = space_element['pipeline']
            params = space_element['params']
            pipeline.set_params(**params)

            self._logger.info(("Iteration {0}: "
                               "Current score is {1}: "
                               "Training pipeline {2} "
                               "with parameters: {3}. ").format(
                                   self._iteration,
                                   self.best_score,
                                   space_element['name'],
                                   params))

            result = self._evaluate(pipeline)
            summary.update(result)

            end_time = time.time()

            summary['status'] = STATUS_OK
            summary['loss'] = self._score_factor * summary['score']
            summary['timestamp'] = datetime.datetime.today()
            summary['train_time'] = end_time - start_time

            self._iteration += 1

            self._score_improved = (self.best_score != self.best_score) or\
                (self._score_factor*result["score"] <
                 self._score_factor*self.best_score)

            if self._score_improved:
                self._logger.info("Score improved, new best score is: {}"
                                  .format(result["score"]))
                self.best_score = result['score']

                if self.configured_summary_saving:
                    self._save_summary(summary)
        except Exception as e:
            self._logger.warning("Trial failed with error {}".format(e))
            summary = {}
            summary['status'] = STATUS_FAIL
            summary['timestamp'] = datetime.datetime.today()
            summary['error'] = e
            for key in ['loss', 'score', 'score_variance', 'train_time']:
                summary[key] = np.nan

        return summary

    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self.space.
        When the run_trials method is finished, the flag self.finished_tuning
        should be set to True, and the methods self._backup_trials and
        optionally self._save_result should be called.
        """
        pass

    @abstractproperty
    def number_of_trials(self) -> int:
        """
        Number of trials already run in the current trials object
        """
        pass

    @abstractproperty
    def best_trial(self) -> dict:
        """
        Best trial so far.
        Should contain the status, pipeline,
        hyperparameters, and the score (loss).
        Other information is optional and is defined
        by self.default_summary
        """
        pass

    @abstractproperty
    def best_trial_score(self) -> float:
        """
        Score of the best pipeline with the best hyperparameters
        """
        pass

    @abstractproperty
    def best_trial_score_variance(self) -> float:
        """
        Variance of the cross-validation score of the best pipeline
        """
        pass

    @abstractproperty
    def best_trial_pipeline(self) -> Pipeline:
        """
        Best pipeline with best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with corresponding
        best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline types
        with corresponding hyperparameters
        """
        pass

    @abstractmethod
    def trials_to_excel(self, path: str) -> None:
        """
        Trials object in the shape of a table written to excel;
        should contain the iteration, pipeline (as str),
        hyperparameters (as str), self.best_result (see self._objective method)
        as well as additional information defined by self.default_summary
        """
        pass
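

# Example (hedged): a minimal sketch of how a concrete child class is
# typically used. "HyperoptPipelineSelector", the accuracy cost function,
# and the run_trials call without arguments are assumptions for
# illustration; the actual child classes live elsewhere in cdplib.
#
#     from sklearn.metrics import accuracy_score
#
#     selector = HyperoptPipelineSelector(
#         cost_func=accuracy_score,
#         greater_is_better=True,
#         trials_path="trials/my_strategy.pkl",
#         backup_trials_freq=5,
#         strategy_name="my_strategy")
#
#     selector.attach_space(example_space)  # see attach_space above
#     selector.attach_data(X_train=X, y_train=y, cv=cv)
#     selector.configure_summary_saving(
#         save_method=pd.DataFrame.to_csv,
#         kwargs={"path_or_buf": "tuning_results.csv"})
#
#     selector.run_trials()
#     best_pipeline = selector.best_trial_pipeline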