#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 14:23:23 2020

@author: tanya

@description: an abstract class for selecting a machine learning
pipeline from a space (deterministic or random) of parameter distributions
over multiple pipelines.
The selection is designed in such a way that a Trials object is
maintained during the tuning process, from which one can retrieve
the best pipeline so far
as well as the entire tuning history if needed.
Methods configure_cross_validation and configure_summary_saving
allow using a custom cross-validation method and
saving the current best result to a file or database during training.
Child classes: hyperopt and custom gridsearch.
"""

import pickle
import os
import sys
import time
import datetime
import numpy as np
import pandas as pd
from copy import deepcopy
from abc import ABC, abstractmethod, abstractproperty

if (sys.version_info.major == 3) and (sys.version_info.minor >= 8):
    from typing import Callable, TypedDict,\
        Literal, Dict, Iterable, List, Tuple, Union
else:
    from typing import Callable, Dict, Iterable, List, Tuple, Union
    from typing_extensions import TypedDict, Literal

import functools
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate as sklearn_cross_validation
from sklearn.metrics import make_scorer
from hyperopt import STATUS_OK, STATUS_FAIL
from cdplib.log import Log
from cdplib.utils import ExceptionsHandler
from cdplib.utils import LoadingUtils
from cdplib.ml_validation import CVComposer

sys.path.append(os.getcwd())


class SpaceElementType(TypedDict):
    name: str
    pipeline: Pipeline
    params: dict
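
# Illustrative sketch (hypothetical estimator and parameter values): a single
# space element conforming to SpaceElementType could look like
#
#   {"name": "logistic_regression",
#    "pipeline": Pipeline([("model", LogisticRegression())]),
#    "params": {"model__C": 1.0}}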

# TODO Tanya: add possibility to include confusion matrix in
# additional metrics
# check that cv object contains indices


class PipelineSelector(ABC):
    """
    An abstract class for selecting a machine learning
    pipeline from a space (deterministic or random) of parameter
    distributions over multiple pipelines.
    The selection is designed in such a way that a Trials object is
    maintained during the tuning process, from which one can retrieve
    the best pipeline so far as well as the entire tuning history
    if needed.
    Methods configure_cross_validation and configure_summary_saving
    allow using a custom cross-validation method and
    saving the current best result to a file or database during training.
    Child classes: hyperopt and custom gridsearch.
    """
    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = None,
                 cross_validation_needs_scorer: bool = True,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Dict[str, Callable] = None,
                 additional_averaging_funcs: Dict[str, Callable] = None,
                 strategy_name: str = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space

        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.

        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object one can retrieve
            information about the obtained scores, score variations,
            and the pipelines and parameters tried out so far.
            If a trials object already exists at the given path, it is
            loaded and the search is continued; otherwise the search is
            started from scratch.

        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            If None, the trials object is backed up every time
            the score improves.

        :param Callable cross_val_averaging_func: function to aggregate
            the cross-validation scores of the cost_func.
            Example different from the mean: mean - c*var.

        :param additional_metrics: dict of additional metrics to keep track
            of in the trials, of the form {"metric_name": metric}.

        :param additional_averaging_funcs: functions used to aggregate
            the output of the cross_validate function, if different
            from cross_val_averaging_func.
            The output always contains the scores of the cost_func and of
            additional_metrics (if not empty),
            but it can also contain additional information
            (like a probability threshold, for example).
            Of the form {"metric_name": averaging_func}

        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            the cv object, and the cost function.
            When the strategy changes, one must start with new trials.

        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        self._logger = Log("PipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        try:
            ExceptionsHandler(self._logger)\
                .assert_is_directory(path=trials_path)

            self.attached_space = False
            self.attached_data = False
            self.configured_cross_validation = False
            self.configured_summary_saving = False

            self._cost_func = cost_func
            self._greater_is_better = greater_is_better
            # score factor is 1 when cost_func is minimized,
            # -1 when cost_func is maximized
            self._score_factor = (not greater_is_better) - greater_is_better
            self._cross_val_averaging_func = cross_val_averaging_func
            self._additional_metrics = additional_metrics or {}
            self._additional_averaging_funcs = additional_averaging_funcs or {}
            self.trials_path = trials_path
            self._backup_trials_freq = backup_trials_freq
            self._strategy_name = strategy_name
            self._data_path = None
            self._cv_path = None

            self._X = None
            self._y = None
            self._cv = None
            self._space = None

            # if cross-validation is not configured,
            # the sklearn cross-validation method is taken by default
            self._cross_validation = sklearn_cross_validation
            self._cross_validation_needs_scorer = cross_validation_needs_scorer

            # if a trials object already exists at the given path,
            # it is loaded and the search is continued. Else,
            # the search is started from the beginning.
            if os.path.isfile(self.trials_path):
                with open(self.trials_path, "rb") as f:
                    self._trials = pickle.load(f)
                if len(self._trials) == 0:
                    self._trials = None
            else:
                self._trials = None

            if self._trials is not None:
                self._start_iteration = self.number_of_trials
                self.best_score = self.best_trial_score
                self._logger.info(("Loaded an existing trials object "
                                   "consisting of {} trials")
                                  .format(self._start_iteration))
            else:
                self._logger.warning(("No existing trials object was found, "
                                      "starting from scratch."))
                self._trials = None
                self._start_iteration = 0
                self.best_score = np.nan

            # keeping track of the current search iteration
            self._iteration = self._start_iteration
            self._score_improved = False

            self.start_tuning_time = datetime.datetime.today()
            self.total_tuning_time = None
            self.finished_tuning = False

        except Exception as e:
            err = ("Failed to initialize the class. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
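
    # Illustrative sketch (hypothetical constant c): an averaging function of
    # the "mean - c*var" form mentioned in the docstring could be passed as
    #
    #   cross_val_averaging_func=lambda scores: np.mean(scores) - 0.1*np.var(scores)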

    def _backup_trials(self) -> None:
        '''
        Pickles (Saves) the trials object in binary format.
        '''
        try:
            with open(self.trials_path, "wb") as f:
                pickle.dump(self._trials, f)
        except Exception as e:
            err = "Could not backup trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def configure_cross_validation(self,
                                   cross_validation: Callable,
                                   kwargs: dict = None) -> None:
        """
        Method for attaching a custom cross-validation function

        :param cross_validation: a function that has the same
            signature as sklearn.model_selection.cross_validate
        """
        try:
            kwargs = kwargs or {}

            self._cross_validation = functools.partial(
                cross_validation, **kwargs)

            self.configured_cross_validation = True

            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to configure cross-validation. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
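
    # Illustrative sketch (all names hypothetical; `clone` would come from
    # sklearn.base): a custom cross-validation callable must mirror the
    # signature of sklearn.model_selection.cross_validate and return a dict
    # of per-fold arrays whose keys start with "test_", e.g.
    #
    #   def my_cross_validate(estimator, X, y=None, cv=None, scoring=None):
    #       scores = {"test_score": [], "test_threshold": []}
    #       for train_inds, val_inds in cv:
    #           est = clone(estimator).fit(X[train_inds], y[train_inds])
    #           scores["test_score"].append(
    #               scoring["score"](est, X[val_inds], y[val_inds]))
    #           scores["test_threshold"].append(0.5)  # e.g. a tuned threshold
    #       return {key: np.array(val) for key, val in scores.items()}
    #
    #   selector.configure_cross_validation(my_cross_validate)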

    def configure_cross_validation_from_module(self,
                                               module_path: str,
                                               name: str) -> None:
        """
        Attaches a cross-validation function defined in
        a different python module. This function must have
        the same signature as sklearn.model_selection.cross_validate

        :param str module_path: path to the python module
            where the cross_validation function is defined.

        :param str name: name of the cross-validation function
            loaded from the python module.
        """
        try:
            self._cross_validation = \
                LoadingUtils().load_from_module(
                    module_path=module_path, name=name)

            self.configured_cross_validation = True

            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to load cross-validation from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space(self, space) -> None:
        """
        Method for attaching the pipeline/hyperparameter space
        over which the cost_func is optimized.

        :param space: space where
            the search is performed. A space might be either
            a list of dictionaries or a hyperopt space object
            the elements of which are dictionaries with keys:
            name, pipeline, params
        """
        try:
            self._space = space

            self.attached_space = True

            self._logger.info("Attached parameter distribution space")
        except Exception as e:
            err = ("Failed to attach space. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
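
    # Illustrative sketch (hypothetical label and elements): a hyperopt space
    # over several such dictionaries could be built with hp.choice, e.g.
    #
    #   from hyperopt import hp
    #   space = hp.choice("pipelines", [element_1, element_2])
    #   selector.attach_space(space)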

    def attach_space_from_module(self, module_path: str, name: str) -> None:
        """
        Attaches a space defined in a different python module.

        :param str module_path: path to the python module
            where the space is defined.

        :param str name: name of the space loaded from
            the python module.
        """
        try:
            self._space = LoadingUtils().load_from_module(
                module_path=module_path, name=name)

            self.attached_space = True

            self._logger.info("Attached parameter distribution space")
        except Exception as e:
            err = ("Failed to attach space from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
                    y_train: Union[pd.DataFrame, pd.Series, np.ndarray]
                    = None,
                    X_val: Union[pd.DataFrame, np.ndarray]
                    = None,
                    y_val: Union[pd.DataFrame, pd.Series, np.ndarray]
                    = None,
                    cv: Union[Iterable[Tuple[List[int], List[int]]]]
                    = None) -> None:
        '''
        :param array X_train: data on which
            machine learning pipelines are trained

        :param array y_train: optional, vector with targets
            (None in case of unsupervised learning)

        :param array X_val: optional, validation data.
            When not provided, the cross-validated value
            of the cost_func is calculated.

        :param array y_val: optional, validation targets

        :param list cv: iterable of tuples containing
            train and validation indices, or an integer representing
            the number of folds for a random split of the data
            during cross-validation.
            Example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
        '''
        try:
            assert((cv is None) == (X_val is not None)),\
                "Either cv or X_val must be provided"

            if cv is None:
                assert((y_val is None) == (y_train is None)),\
                    "y_train and y_val must be provided simultaneously"

                # create a trivial cv object with one validation split:
                # train on X_train, validate on X_val
                train_inds = list(range(len(X_train)))
                val_inds = list(range(len(X_train),
                                      len(X_train) + len(X_val)))

                self._cv = [(train_inds, val_inds)]

                self._X = np.concatenate([X_train, X_val])
                self._y = None if y_train is None\
                    else np.concatenate([y_train, y_val])
            else:
                self._cv = cv
                self._X = X_train
                self._y = y_train

            self.attached_data = True

            self._logger.info("Attached data")
        except Exception as e:
            err = ("Failed to attach data. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None) -> None:
        """
        Method for attaching data from an hdf5 store
        and a cv object from a pickled file.
        The hdf5 store is a binary file;
        after loading it, it behaves like a dictionary with keys
        X_train (y_train, X_val, y_val).
        The cv is loaded from a pickle file.
        The reason to separate the data store from the cv store
        is that hdf5 is optimized for storing large dataframes
        (especially with simple types), while a small list of lists
        like a cv object is better stored as a pickle file.

        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data

        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path)),\
                "Parameter data_hdf5_store_path is not a file"

            # close all opened files, because hdf5 will
            # fail to reopen an already opened file
            import tables
            tables.file._open_files.close_all()

            store = pd.HDFStore(data_hdf5_store_path)

            self._data_path = data_hdf5_store_path

            data_input = {key: store[key] if key in store else None
                          for key in ["X_train", "y_train", "X_val", "y_val"]}

            if cv_pickle_path is not None:
                assert(os.path.isfile(cv_pickle_path)),\
                    "Parameter cv_pickle_path is not a file"

                with open(cv_pickle_path, "rb") as f:
                    data_input["cv"] = pickle.load(f)

                self._cv_path = cv_pickle_path
            else:
                data_input["cv"] = None

            self.attach_data(**data_input)

            store.close()

        except Exception as e:
            err = "Failed to attach data. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)
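
    # Illustrative sketch (hypothetical paths and data): preparing the two
    # stores that attach_data_from_hdf5 expects.
    #
    #   store = pd.HDFStore("data.h5")
    #   store["X_train"], store["y_train"] = X_train, y_train
    #   store.close()
    #
    #   with open("cv.pkl", "wb") as f:
    #       pickle.dump([([0, 1, 2], [3, 4])], f)
    #
    #   selector.attach_data_from_hdf5("data.h5", cv_pickle_path="cv.pkl")
    #
    # Alternatively, store X_val/y_val in the hdf5 file and omit
    # cv_pickle_path; attach_data then builds a single train/validation split.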

    @property
    def default_summary(self) -> dict:
        """
        Default summary of the strategy.
        Every time the _objective function is called,
        the current score and the information
        about the tested space element are added to the
        summary and it is saved to the Trials.
        If summary saving is configured, it is also
        saved to a file or a database when the score improves.
        """
        summary = {}

        if self._strategy_name is not None:
            summary["strategy_name"] = self._strategy_name

        if isinstance(self._cost_func, str):
            summary["cost_func"] = self._cost_func
        elif hasattr(self._cost_func, "__name__"):
            summary["cost_func"] = self._cost_func.__name__

        summary["trials_path"] = self.trials_path

        if self._data_path is not None:
            summary["data_path"] = self._data_path

        if self._cv_path is not None:
            summary["cv_path"] = self._cv_path

        summary["start_tuning_time"] = self.start_tuning_time

        summary["iteration"] = self._iteration

        return summary

    def configure_summary_saving(self,
                                 save_method: Callable
                                 = functools.partial(
                                     pd.DataFrame.to_excel,
                                     **{"excel_writer": "result.xlsx"}),
                                 kwargs: dict = None) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and can be saved to a file or database, depending
        on the configured save_method.

        :param Callable save_method: method for saving the result
            of the pipeline selection. The method must accept
            a pandas DataFrame as argument.
            By default, saving to an excel file.

            Examples:

                functools.partial(pd.DataFrame.to_csv,
                                  **{"path_or_buf": <PATH>})

                functools.partial(np.savetxt, **{"fname": <PATH>})

                functools.partial(SQLHandler(<URI>).append_to_table,
                                  **{"tablename": <NAME>})

                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
                                  **{"collection_name": <NAME>})

            Using functools can be avoided by providing the kwargs argument.

        :param dict kwargs: a dictionary with keyword arguments
            (like tablename) to pass to the save_method
        """
        try:
            kwargs = kwargs or {}

            self._save_method = functools.partial(save_method, **kwargs)

            self.configured_summary_saving = True

            self._logger.info("Configured summary saving")
        except Exception as e:
            err = ("Failed to configure the summary saving. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)
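
    # Illustrative sketch (hypothetical path): the same configuration without
    # functools, using the kwargs argument instead:
    #
    #   selector.configure_summary_saving(save_method=pd.DataFrame.to_csv,
    #                                     kwargs={"path_or_buf": "result.csv"})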

    def _save_summary(self, summary: dict) -> None:
        """
        When the score calculated by the _objective function improves,
        the default summary is updated with information about the
        current score and pipeline/hyperparameters
        and is saved to a file or database with the configured save_method.
        """
        try:
            assert(self.configured_summary_saving),\
                "Result saving must be configured first"

            self._save_method(summary)

        except Exception as e:
            err = ("Could not save the summary. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
        """
        Calculates the averaged cross-validated score and score variance,
        as well as the averaged values and variances of the additional metrics.

        This method is called in the _objective function that is
        passed to the hyperopt optimizer.

        This method can be overridden when the cost
        needs to be calculated differently,
        for example with a tensorflow model.

        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation

        :return: dictionary with the aggregated
            cross-validation scores and
            the score variances for the scores in the output
            of the cross-validation function.

            Form of the output:

                {"score": 10, # score used in optimization
                 "score_variance": 0.5,
                 "additional_metric1": 5,
                 "additional_metric1_variance": 7}

            A custom cross-validation function can also include, for
            example, a probability threshold for each fold; then
            the output of this method will include the average
            value and the variance of the probability threshold
            over the folds.
        """
        try:
            # the cost function plus any additional metrics to track
            scoring = {"score": self._cost_func, **self._additional_metrics}

            if self._cross_validation_needs_scorer:
                for metric_name, metric in scoring.items():
                    scoring[metric_name] = make_scorer(
                        metric, greater_is_better=self._greater_is_better)

            cross_validation_input_args = {
                "estimator": pipeline,
                "X": self._X,
                "y": self._y,
                "cv": self._cv,
                "scoring": scoring
            }

            # only pass error_score if the cross-validation function
            # declares it (custom functions may not accept it)
            if "error_score" in getattr(self._cross_validation,
                                        "__annotations__", {}):
                cross_validation_input_args["error_score"] = np.nan

            scores = self._cross_validation(**cross_validation_input_args)

            averaging_funcs = {
                metric_name: self._additional_averaging_funcs[metric_name]
                if metric_name in self._additional_averaging_funcs
                else self._cross_val_averaging_func
                for metric_name in scores}

            scores_average = {
                metric_name.replace("test_", ""):
                averaging_funcs[metric_name](scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            scores_variance = {
                metric_name.replace("test_", "") + "_variance":
                np.var(scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            return {**scores_average, **scores_variance}

        except Exception as e:
            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def _objective(self, space_element: SpaceElementType) -> dict:
        '''
        This method is called in the run_trials method
        that uses the hyperopt fmin optimizer.
        Uses the _evaluate method.

        It must take as input a space element
        and produce an output in the form of a dictionary
        with 2 obligatory values, loss and status
        (STATUS_OK or STATUS_FAIL). Other
        values in the output are optional and can be
        accessed later through the trials object.

        :Warning: fmin minimizes the loss;
        when _evaluate returns a value to be maximized,
        it is multiplied by -1 to obtain the loss.

        :param SpaceElementType space_element: element
            of the space over which the optimization is done

        :return: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            understood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time (execution time),
            and other keys given in self.default_summary
        '''
        try:
            start_time = time.time()

            assert(self.attached_data),\
                ("Data must be attached in order "
                 "to effectuate the best pipeline search")

            summary = deepcopy(self.default_summary)

            # backup the current trials if the score improved
            # at the previous iteration, or every i-th iteration
            # if backup_trials_freq is set
            backup_cond = ((self._backup_trials_freq is not None) and
                           ((self._iteration - self._start_iteration - 1) %
                            self._backup_trials_freq == 0)) or\
                self._score_improved

            if backup_cond:
                self._backup_trials()
                self._score_improved = False

            pipeline = space_element['pipeline']
            params = space_element['params']
            pipeline.set_params(**params)

            self._logger.info(("Iteration {0}: "
                               "Current score is {1}: "
                               "Training pipeline {2} "
                               "with parameters: {3}. ").format(
                                   self._iteration,
                                   self.best_score,
                                   space_element['name'],
                                   params))

            result = self._evaluate(pipeline)

            summary.update(result)

            end_time = time.time()

            summary['status'] = STATUS_OK
            summary['loss'] = self._score_factor * summary['score']
            summary['timestamp'] = datetime.datetime.today()
            summary['train_time'] = end_time - start_time

            self._iteration += 1

            # best_score != best_score is True only when best_score is NaN,
            # i.e. when no score has been recorded yet
            self._score_improved = (self.best_score != self.best_score) or\
                                   (self._score_factor*result["score"] <
                                    self._score_factor*self.best_score)

            if self._score_improved:

                self._logger.info("Score improved, new best score is: {}"
                                  .format(result["score"]))

                self.best_score = result['score']

                if self.configured_summary_saving:
                    self._save_summary(summary)

        except Exception as e:

            self._logger.warning("Trial failed with error {}".format(e))

            summary = {}
            summary['status'] = STATUS_FAIL
            summary['timestamp'] = datetime.datetime.today()
            summary['error'] = e
            for key in ['loss', 'score', 'score_variance', 'train_time']:
                summary[key] = np.nan

        return summary

    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self.space.
        When the run_trials method is finished, the flag self.finished_tuning
        should be set to True and the method self._backup_trials and
        optionally self._save_summary should be called.
        """
        pass

    @abstractproperty
    def number_of_trials(self) -> int:
        """
        Number of trials already run in the current trials object
        """
        pass

    @abstractproperty
    def best_trial(self) -> dict:
        """
        Best trial so far.
        Should contain the status, pipeline,
        hyperparameters, and the score (loss).
        Other information is optional and is defined
        by self.default_summary
        """
        pass

    @abstractproperty
    def best_trial_score(self) -> float:
        """
        Score of the best pipeline with the best hyperparameters
        """
        pass

    @abstractproperty
    def best_trial_score_variance(self) -> float:
        """
        Variance of the cross-validation score of the best pipeline
        """
        pass

    @abstractproperty
    def best_trial_pipeline(self) -> Pipeline:
        """
        Best pipeline with best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with corresponding
        best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline types
        with corresponding hyperparameters
        """
        pass

    @abstractmethod
    def trials_to_excel(self, path: str) -> None:
        """
        Trials object in the shape of a table written to excel;
        should contain the iteration, pipeline (as str),
        hyperparameters (as str), self.best_result (see self._objective
        method), as well as additional information defined by
        self.default_summary
        """
        pass
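

# Illustrative sketch (not part of this module; names such as
# HyperoptSelector and niter are hypothetical): a concrete child class
# might wire run_trials to hyperopt's fmin roughly like this.
#
#   from hyperopt import fmin, tpe, Trials
#
#   class HyperoptSelector(PipelineSelector):
#
#       def run_trials(self, niter: int):
#           self._trials = self._trials or Trials()
#           fmin(fn=self._objective,
#                space=self._space,
#                algo=tpe.suggest,
#                trials=self._trials,
#                max_evals=self._start_iteration + niter)
#           self.finished_tuning = True
#           self.total_tuning_time = datetime.datetime.today()\
#               - self.start_tuning_time
#           self._backup_trials()
#
#       @property
#       def number_of_trials(self) -> int:
#           return len(self._trials.trials)
#
#       # ... remaining abstract members implemented analogously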