#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 14:23:23 2020
@author: tanya
@description: an abstract class for selecting a machine learning
pipeline from a space (deterministic or random) of parameter distributions
over multiple pipelines.
The selection is designed in such a way that a Trials object is
maintained during the tuning process, from which one can retrieve
the best pipeline so far
as well as the entire tuning history if needed.
The methods configure_cross_validation and configure_summary_saving
allow using a custom cross-validation method and
saving the current best result to a file or database during training.
Child classes: hyperopt and custom gridsearch.
"""
import pickle
import os
import sys
import time
import datetime
import functools

import numpy as np
import pandas as pd

from copy import deepcopy
from abc import ABC, abstractmethod, abstractproperty

# TypedDict and Literal were added to the standard typing module in
# python 3.8; older interpreters take them from typing_extensions.
if sys.version_info >= (3, 8):
    from typing import (Callable, TypedDict, Literal,
                        Dict, Iterable, List, Tuple, Union)
else:
    from typing import Callable, Dict, Iterable, List, Tuple, Union
    from typing_extensions import TypedDict, Literal

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate as sklearn_cross_validation
from sklearn.metrics import make_scorer

from hyperopt import STATUS_OK, STATUS_FAIL

from cdplib.log import Log
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
from cdplib.utils import LoadingUtils
from cdplib.ml_validation import CVComposer

sys.path.append(os.getcwd())


class SpaceElementType(TypedDict):
    name: str
    pipeline: Pipeline
    params: dict
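

# For illustration only: a space element of this type could look like the
# following (the estimator and the parameter values are hypothetical, not
# part of this module):
#
#     from sklearn.linear_model import LogisticRegression
#
#     example_element: SpaceElementType = {
#         "name": "logistic_regression",
#         "pipeline": Pipeline([("clf", LogisticRegression())]),
#         "params": {"clf__C": 1.0, "clf__penalty": "l2"}}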


# TODO Tanya: add the possibility to include a confusion matrix in
#             the additional metrics
# TODO: check that the cv object contains indices


class PipelineSelector(ABC):
    """
    An abstract class for selecting a machine learning
    pipeline from a space (deterministic or random) of parameter
    distributions over multiple pipelines.
    The selection is designed in such a way that a Trials object is
    maintained during the tuning process, from which one can retrieve
    the best pipeline so far as well as the entire tuning history
    if needed.
    The methods configure_cross_validation and configure_summary_saving
    allow using a custom cross-validation method and
    saving the current best result to a file or database during training.
    Child classes: hyperopt and custom gridsearch.
    """
    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = None,
                 cross_validation_needs_scorer: bool = True,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Dict[str, Callable] = None,
                 additional_averaging_funcs: Dict[str, Callable] = None,
                 strategy_name: str = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space
        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.
        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            retrieve information about the obtained scores, score variations,
            and the pipelines and parameters tried out so far. If a trials
            object already exists at the given path, it is loaded and the
            search is continued, else the search is started from scratch.
        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            If None, the trials object is backed up every time
            the score improves.
        :param Callable cross_val_averaging_func: function to aggregate
            the cross-validation scores of the cost_func.
            Example different from the mean: mean - c*var.
        :param additional_metrics: dict of additional metrics to keep track
            of in the trials, of the form {"metric_name": metric}.
        :param additional_averaging_funcs: functions used to aggregate
            the output of the cross_validate function, if different from
            cross_val_averaging_func.
            The output always contains the scores of the cost_func and of
            the additional_metrics (if not empty), but it can also contain
            additional information (like a probability threshold for example).
            Of the form {"metric_name": averaging_func}
        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            the cv object and the cost function.
            When the strategy changes, one must start with new trials.
        :param str stdout_log_level: can be INFO, WARNING, or ERROR
        """
        self._logger = Log("PipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        try:
            ExceptionsHandler(self._logger)\
                .assert_is_directory(path=trials_path)

            self.attached_space = False
            self.attached_data = False
            self.configured_cross_validation = False
            self.configured_summary_saving = False

            self._cost_func = cost_func
            self._greater_is_better = greater_is_better
            # score factor is 1 when cost_func is minimized,
            # -1 when cost_func is maximized
            self._score_factor = (not greater_is_better) - greater_is_better
            self._cross_val_averaging_func = cross_val_averaging_func
            self._additional_metrics = additional_metrics or {}
            self._additional_averaging_funcs = additional_averaging_funcs or {}
            self.trials_path = trials_path
            self._backup_trials_freq = backup_trials_freq
            self._strategy_name = strategy_name
            self._data_path = None
            self._cv_path = None

            self._X = None
            self._y = None
            self._cv = None
            self._space = None

            # if cross-validation is not configured,
            # the sklearn cross-validation method is taken by default
            self._cross_validation = sklearn_cross_validation
            self._cross_validation_needs_scorer = cross_validation_needs_scorer

            # if a trials object already exists at the given path,
            # it is loaded and the search is continued. Else,
            # the search is started from the beginning.
            if os.path.isfile(self.trials_path):
                with open(self.trials_path, "rb") as f:
                    self._trials = pickle.load(f)
                if len(self._trials) == 0:
                    self._trials = None
            else:
                self._trials = None

            if self._trials is not None:
                self._start_iteration = self.number_of_trials
                self.best_score = self.best_trial_score
                self._logger.info(("Loaded an existing trials object "
                                   "consisting of {} trials")
                                  .format(self._start_iteration))
            else:
                self._logger.warning(("No existing trials object was found, "
                                      "starting from scratch."))
                self._trials = None
                self._start_iteration = 0
                self.best_score = np.nan

            # keeping track of the current search iteration
            self._iteration = self._start_iteration
            self._score_improved = False

            self.start_tuning_time = datetime.datetime.today()
            self.total_tuning_time = None
            self.finished_tuning = False

        except Exception as e:
            err = ("Failed to initialize the class. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
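
    # For illustration only: the cross_val_averaging_func mentioned above can
    # penalize score variance across the folds, e.g. mean - c*var. A concrete
    # subclass (hyperopt or gridsearch, not defined in this module; the class
    # name below is hypothetical) would receive it through the constructor:
    #
    #     from sklearn.metrics import accuracy_score
    #
    #     def penalized_mean(scores, c=0.1):
    #         return np.mean(scores) - c * np.var(scores)
    #
    #     selector = SomeConcretePipelineSelector(
    #         cost_func=accuracy_score,
    #         greater_is_better=True,
    #         trials_path="trials/my_trials.pkl",
    #         cross_val_averaging_func=penalized_mean)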

    def _backup_trials(self) -> None:
        '''
        Pickles (saves) the trials object in binary format.
        '''
        try:
            with open(self.trials_path, "wb") as f:
                pickle.dump(self._trials, f)

        except Exception as e:
            err = "Could not backup trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def configure_cross_validation(self,
                                   cross_validation: Callable,
                                   kwargs: dict = None) -> None:
        """
        Method for attaching a custom cross-validation function.
        :param cross_validation: a function that has the same
            signature as sklearn.model_selection.cross_validate
        """
        try:
            kwargs = kwargs or {}

            self._cross_validation = functools.partial(
                cross_validation, **kwargs)

            self.configured_cross_validation = True

            self._logger.info("Configured cross validation")

        except Exception as e:
            err = ("Failed to configure cross-validation. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
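
    # For illustration only: any callable with the signature of
    # sklearn.model_selection.cross_validate can be attached here; its extra
    # keyword arguments are bound via kwargs. The selector instance below is
    # hypothetical:
    #
    #     from sklearn.model_selection import cross_validate
    #
    #     selector.configure_cross_validation(
    #         cross_validation=cross_validate,
    #         kwargs={"n_jobs": -1, "return_train_score": True})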

    def configure_cross_validation_from_module(self,
                                               module_path: str,
                                               name: str) -> None:
        """
        Attaches a cross-validation function defined in
        a different python module. This function must have
        the same signature as sklearn.model_selection.cross_validate.
        :param str module_path: path to the python module
            where the cross-validation function is defined.
        :param str name: name of the cross-validation function
            loaded from the python module.
        """
        try:
            self._cross_validation = \
                LoadingUtils().load_from_module(
                    module_path=module_path, name=name)

            self.configured_cross_validation = True
            self._logger.info("Configured cross validation")

        except Exception as e:
            err = ("Failed to load cross-validation from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space(self, space) -> None:
        """
        Method for attaching the pipeline/hyperparameter space
        over which the cost_func is optimized.
        :param space: space where the search is performed.
            A space might be either a list of dictionaries or
            a hyperopt space object, the elements of which are
            dictionaries with the keys: name, pipeline, params
        """
        try:
            self._space = space
            self.attached_space = True
            self._logger.info("Attached parameter distribution space")

        except Exception as e:
            err = ("Failed to attach space. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space_from_module(self, module_path: str, name: str) -> None:
        """
        Attaches a space defined in a different python module.
        :param str module_path: path to the python module
            where the space is defined.
        :param str name: name of the space loaded from
            the python module.
        """
        try:
            self._space = LoadingUtils().load_from_module(
                module_path=module_path, name=name)

            self.attached_space = True
            self._logger.info("Attached parameter distribution space")

        except Exception as e:
            err = ("Failed to attach space from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
                    y_train: Union[pd.DataFrame, pd.Series, np.ndarray]
                    = None,
                    X_val: Union[pd.DataFrame, np.ndarray]
                    = None,
                    y_val: Union[pd.DataFrame, pd.Series, np.ndarray]
                    = None,
                    cv: Union[Iterable[Tuple[List[int], List[int]]]]
                    = None) -> None:
        '''
        :param array X_train: data on which
            machine learning pipelines are trained
        :param array y_train: optional, vector with targets
            (None in case of unsupervised learning)
        :param array X_val: optional, validation data.
            When not provided, the cross-validated value
            of the cost_func is calculated.
        :param array y_val: optional, validation targets
        :param list cv: iterable of tuples containing
            train and validation indices, or an integer representing
            the number of folds for a random split of the data
            during cross-validation.
            Example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
        '''
        try:
            assert((cv is None) == (X_val is not None)),\
                "Either cv or X_val must be provided"

            if cv is None:
                assert((y_val is None) == (y_train is None)),\
                    "y_train and y_val must be provided together"

                # Here we create a trivial cv object
                # with one validation split.
                # XXX Tanya finish here: CVComposer.dummy_cv() could be
                # used instead of building the indices by hand.
                train_inds = list(range(len(X_train)))
                val_inds = list(range(len(X_train),
                                      len(X_train) + len(X_val)))

                self._cv = [(train_inds, val_inds)]

                self._X = np.concatenate([X_train, X_val])
                self._y = None if y_train is None\
                    else np.concatenate([y_train, y_val])

            else:
                self._cv = cv
                self._X = X_train
                self._y = y_train

            self.attached_data = True
            self._logger.info("Attached data")

        except Exception as e:
            err = ("Failed to attach data. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
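
    # For illustration only: data can be attached either with explicit
    # validation data or with a cv object. The arrays and the selector
    # instance below are hypothetical:
    #
    #     import numpy as np
    #
    #     X = np.random.rand(100, 5)
    #     y = np.random.randint(0, 2, size=100)
    #
    #     # variant 1: explicit train/validation split
    #     selector.attach_data(X_train=X[:80], y_train=y[:80],
    #                          X_val=X[80:], y_val=y[80:])
    #
    #     # variant 2: predefined cross-validation folds
    #     folds = [(list(range(0, 80)), list(range(80, 100))),
    #              (list(range(20, 100)), list(range(0, 20)))]
    #     selector.attach_data(X_train=X, y_train=y, cv=folds)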

    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None) -> None:
        """
        Method for attaching data from an hdf5 store
        and a cv object from a pickled file.
        The hdf5 store is a binary file which, after loading,
        is a dictionary with the keys X_train (y_train, X_val, y_val).
        The cv is loaded from a pickle file.
        The reason to separate the data store from the cv store
        is that hdf5 is optimized for storing large dataframes
        (especially with simple types), while a small list of lists
        like a cv object is better stored as a pickle file.
        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data
        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path)),\
                "Parameter data_hdf5_store_path is not a file"

            # close all open files, because hdf5 will
            # fail to reopen an already opened file
            import tables
            tables.file._open_files.close_all()

            store = pd.HDFStore(data_hdf5_store_path)

            self._data_path = data_hdf5_store_path

            data_input = {key: store[key] if key in store else None
                          for key in ["X_train", "y_train", "X_val", "y_val"]}

            if cv_pickle_path is not None:
                assert(os.path.isfile(cv_pickle_path)),\
                    "Parameter cv_pickle_path is not a file"

                with open(cv_pickle_path, "rb") as f:
                    data_input["cv"] = pickle.load(f)

                self._cv_path = cv_pickle_path

            else:
                data_input["cv"] = None

            self.attach_data(**data_input)

            store.close()

        except Exception as e:
            err = "Failed to attach data. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)
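
    # For illustration only: a data store and a cv pickle in the format
    # expected above could be prepared as follows (the paths and the
    # dataframes are made up):
    #
    #     import pickle
    #     import pandas as pd
    #
    #     with pd.HDFStore("data/strategy_data.h5") as store:
    #         store["X_train"] = X_train_df      # pandas DataFrame
    #         store["y_train"] = y_train_series  # pandas Series
    #
    #     with open("data/strategy_cv.pkl", "wb") as f:
    #         pickle.dump([([0, 1, 2], [3, 4]), ([1, 2, 3], [4, 5])], f)
    #
    #     selector.attach_data_from_hdf5(
    #         data_hdf5_store_path="data/strategy_data.h5",
    #         cv_pickle_path="data/strategy_cv.pkl")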

    @property
    def default_summary(self) -> dict:
        """
        Default summary of the strategy.
        Every time the _objective function is called,
        the current score and the information
        about the tested space element are added to the
        summary and it is saved to the Trials.
        If summary saving is configured, it is also
        saved to a file or a database when the score improves.
        """
        summary = {}

        if self._strategy_name is not None:
            summary["strategy_name"] = self._strategy_name

        if isinstance(self._cost_func, str):
            summary["cost_func"] = self._cost_func
        elif hasattr(self._cost_func, "__name__"):
            summary["cost_func"] = self._cost_func.__name__

        summary["trials_path"] = self.trials_path

        if self._data_path is not None:
            summary["data_path"] = self._data_path

        if self._cv_path is not None:
            summary["cv_path"] = self._cv_path

        summary["start_tuning_time"] = self.start_tuning_time

        summary["iteration"] = self._iteration

        return summary

    def configure_summary_saving(self,
                                 save_method: Callable
                                 = functools.partial(
                                     pd.DataFrame.to_excel,
                                     **{"excel_writer": "result.xlsx"}),
                                 kwargs: dict = None) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and can be saved to a file or database, depending
        on the configured save_method.
        :param Callable save_method: method for saving the result
            of the pipeline selection. The method must accept
            a pandas DataFrame as argument.
            By default, saving to an excel file.
            Examples:
                functools.partial(pd.DataFrame.to_csv,
                                  **{"path_or_buf": <PATH>})
                functools.partial(np.savetxt, **{"fname": <PATH>})
                functools.partial(SQLHandler(<URI>).append_to_table,
                                  **{"tablename": <NAME>})
                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
                                  **{"collection_name": <NAME>})
            Using functools can be avoided by providing the kwargs argument.
        :param dict kwargs: a dictionary with keyword arguments
            (like tablename) to pass to the save_method
        """
        try:
            kwargs = kwargs or {}

            self._save_method = functools.partial(save_method, **kwargs)

            self.configured_summary_saving = True

            self._logger.info("Configured summary saving")

        except Exception as e:
            err = ("Failed to configure the summary saving. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    # backward-compatible alias for the original (misspelled) method name
    configer_summary_saving = configure_summary_saving
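
    # For illustration only: instead of wrapping the save method in
    # functools.partial, its keyword arguments can be passed via kwargs.
    # The selector instance and the output path are hypothetical:
    #
    #     selector.configure_summary_saving(
    #         save_method=pd.DataFrame.to_csv,
    #         kwargs={"path_or_buf": "results/best_summary.csv"})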

    def _save_summary(self, summary: dict) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and is saved with the configured save_method.
        """
        try:
            assert(self.configured_summary_saving),\
                "Result saving must be configured first"

            self._save_method(summary)

        except Exception as e:
            err = ("Could not save the summary. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
        """
        Calculates the averaged cross-validated score and score variance,
        as well as the averaged values and variances of the additional
        metrics.
        This method is called in the _objective function that is
        passed to the hyperopt optimizer.
        This function can be overridden when the cost
        needs to be calculated differently,
        for example with a tensorflow model.
        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation
        :return: dictionary with the aggregated
            cross-validation scores and
            the score variances for the scores in the output
            of the cross-validation function.
            Form of the output:
                {"score": 10,  # score used in optimization
                 "score_variance": 0.5,
                 "additional_metric1": 5,
                 "additional_metric1_variance": 7}
            A custom cross-validation function can also include, for
            example, a probability threshold for each fold; then
            the output of this function will include the average
            value and the variance of the probability threshold
            over the folds.
        """
        try:
            scoring = {"score": self._cost_func, **self._additional_metrics}

            if self._cross_validation_needs_scorer:
                for metric_name, metric in scoring.items():
                    scoring[metric_name] = make_scorer(
                        metric, greater_is_better=self._greater_is_better)

            cross_validation_input_args = {
                "estimator": pipeline,
                "X": self._X,
                "y": self._y,
                "cv": self._cv,
                "scoring": scoring
            }

            if "error_score" in getattr(self._cross_validation,
                                        "__annotations__", {}):
                cross_validation_input_args["error_score"] = np.nan

            scores = self._cross_validation(**cross_validation_input_args)

            averaging_funcs = {
                metric_name: self._additional_averaging_funcs[metric_name]
                if metric_name in self._additional_averaging_funcs
                else self._cross_val_averaging_func
                for metric_name in scores}

            scores_average = {
                metric_name.replace("test_", ""):
                averaging_funcs[metric_name](scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            scores_variance = {
                metric_name.replace("test_", "") + "_variance":
                np.var(scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            return {**scores_average, **scores_variance}

        except Exception as e:
            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def _objective(self, space_element: SpaceElementType) -> dict:
        '''
        This method is called in the run_trials method,
        which uses the hyperopt fmin optimizer.
        Uses the _evaluate method.
        It must take a space element as input
        and produce as output a dictionary
        with the two obligatory values loss and status
        (STATUS_OK or STATUS_FAIL). Other
        values in the output are optional and can be
        accessed later through the trials object.
        :Warning: fmin minimizes the loss;
            when _evaluate returns a value to be maximized,
            it is multiplied by -1 to obtain the loss.
        :param SpaceElementType space_element: element
            of the space over which the optimization is done
        :output: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            understood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time (execution time),
            and other keys given in self.default_summary
        '''
        try:
            start_time = time.time()

            assert(self.attached_data),\
                ("Data must be attached in order "
                 "to perform the best pipeline search")

            summary = deepcopy(self.default_summary)

            # backup the current trials if the score improved
            # at the previous iteration or every i-th iteration
            # if backup_trials_freq is set
            backup_cond = ((self._backup_trials_freq is not None) and
                           ((self._iteration - self._start_iteration - 1) %
                            self._backup_trials_freq == 0)) or\
                self._score_improved

            if backup_cond:
                self._backup_trials()
                self._score_improved = False

            pipeline = space_element['pipeline']
            params = space_element['params']
            pipeline.set_params(**params)

            self._logger.info(("Iteration {0}: "
                               "Current score is {1}: "
                               "Training pipeline {2} "
                               "with parameters: {3}. ").format(
                                   self._iteration,
                                   self.best_score,
                                   space_element['name'],
                                   params))

            result = self._evaluate(pipeline)

            summary.update(result)

            end_time = time.time()

            summary['status'] = STATUS_OK
            summary['loss'] = self._score_factor * summary['score']
            summary['timestamp'] = datetime.datetime.today()
            summary['train_time'] = end_time - start_time

            self._iteration += 1

            # best_score != best_score is only True when best_score is NaN,
            # i.e. before the first successful trial
            self._score_improved = (self.best_score != self.best_score) or\
                                   (self._score_factor*result["score"] <
                                    self._score_factor*self.best_score)

            if self._score_improved:

                self._logger.info("Score improved, new best score is: {}"
                                  .format(result["score"]))

                self.best_score = result['score']

                if self.configured_summary_saving:
                    self._save_summary(summary)

        except Exception as e:

            self._logger.warning("Trial failed with error {}".format(e))

            summary = {}
            summary['status'] = STATUS_FAIL
            summary['timestamp'] = datetime.datetime.today()
            summary['error'] = e

            for key in ['loss', 'score', 'score_variance', 'train_time']:
                summary[key] = np.nan

        return summary

    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self.space.
        When the run_trials method is finished, the flag self.finished_tuning
        should be set to True, and the method self._backup_trials and
        optionally self._save_summary should be called.
        """
        pass

    @abstractproperty
    def number_of_trials(self) -> int:
        """
        Number of trials already run in the current trials object
        """
        pass

    @abstractproperty
    def best_trial(self) -> dict:
        """
        Best trial so far.
        Should contain the status, pipeline,
        hyperparameters, and the score (loss).
        Other information is optional and is defined
        by self.default_summary
        """
        pass

    @abstractproperty
    def best_trial_score(self) -> float:
        """
        Score of the best pipeline with the best hyperparameters
        """
        pass

    @abstractproperty
    def best_trial_score_variance(self) -> float:
        """
        Variance of the cross-validation score of the best pipeline
        """
        pass

    @abstractproperty
    def best_trial_pipeline(self) -> Pipeline:
        """
        Best pipeline with the best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with the corresponding
        best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipelines, returns n different pipeline types
        with the corresponding hyperparameters
        """
        pass

    @abstractmethod
    def trials_to_excel(self, path: str) -> None:
        """
        Writes the trials object in the shape of a table to excel;
        it should contain the iteration, the pipeline (as str),
        the hyperparameters (as str), self.best_result (see the _objective
        method), as well as additional information defined by
        self.default_summary.
        """
        pass
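

# For illustration only: a typical tuning workflow with one of the concrete
# child classes (hyperopt or gridsearch, not defined in this module). The
# class name HyperoptPipelineSelector, the space module and all paths are
# hypothetical:
#
#     from sklearn.metrics import f1_score
#
#     selector = HyperoptPipelineSelector(
#         cost_func=f1_score,
#         greater_is_better=True,
#         trials_path="trials/churn_trials.pkl",
#         backup_trials_freq=10,
#         strategy_name="churn_v1")
#
#     selector.attach_space_from_module(
#         module_path="spaces/churn_space.py", name="space")
#     selector.attach_data_from_hdf5(
#         data_hdf5_store_path="data/churn_data.h5",
#         cv_pickle_path="data/churn_cv.pkl")
#     selector.configure_summary_saving(
#         save_method=pd.DataFrame.to_csv,
#         kwargs={"path_or_buf": "results/churn_summary.csv"})
#
#     selector.run_trials()
#     best_pipeline = selector.best_trial_pipeline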