PipelineSelector.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Sep 30 14:23:23 2020
  5. @author: tanya
  6. @description: an abstract class for selecting a machine learning
  7. pipeline from a space (deterministic or random) of parameter distributions
  8. over multiple pipelines.
  9. The selection is thought in such a way that a Trials object is being
  10. maintained during the tuning process from which one can retrieve
  11. the best pipeline so far
  12. as well as the entire tuning history if needed.
  13. Methods configure_cross_validation and configure_result_saving
  14. allow to use a custom cross-validation method and
  15. save the current best result in a file or database during training.
  16. Children classes: hyperopt and custom gridsearch.
  17. """
  18. import pickle
  19. import os
  20. import sys
  21. import time
  22. import datetime
  23. import numpy as np
  24. import pandas as pd
  25. from copy import deepcopy
  26. from abc import ABC, abstractmethod, abstractproperty
  27. if (sys.version_info.major == 3) & (sys.version_info.minor >= 8):
  28. print("I have python version {}.{} and will import typing".format(sys.version_info.major, sys.version_info.minor))
  29. from typing import Callable, TypedDict,\
  30. Literal, Dict, Iterable, List, Tuple, Union
  31. else:
  32. # from typing_extensions import *
  33. print("I have python version {}.{} and will import typing_extensions".format(sys.version_info.major, sys.version_info.minor))
  34. from typing_extensions import TypedDict
  35. import functools
  36. from sklearn.pipeline import Pipeline
  37. from sklearn.model_selection import cross_validate as sklearn_cross_validation
  38. from sklearn.metrics import make_scorer
  39. from hyperopt import STATUS_OK, STATUS_FAIL
  40. from cdplib.log import Log
  41. from cdplib.utils.ExceptionsHandler import ExceptionsHandler
  42. from cdplib.utils import LoadingUtils
  43. from cdplib.ml_validation import CVComposer
  44. sys.path.append(os.getcwd())
  45. class SpaceElementType(TypedDict):
  46. name: str
  47. pipeline: Pipeline
  48. params: dict
  49. # TODO Tanya: add possibility to include confusion matrix in
  50. # additional metrics
  51. # check that cv object contains indices
  52. class PipelineSelector(ABC):
  53. """
  54. An abstract class for selecting a machine learning
  55. pipeline from a space (deterministic or random) of parameter
  56. distributions over multiple pipelines.
  57. The selection is thought in such a way that a Trials object is being
  58. maintained during the tuning process from which one can retrieve
  59. the best pipeline so far as well as the entire tuning history
  60. if needed.
  61. Methods configure_cross_validation and configure_result_saving
  62. allow to use a custom cross-validation method and
  63. save the current best result in a file or database during training.
  64. Children classes: hyperopt and custom gridsearch.
  65. """
  66. def __init__(self,
  67. # cost_func: Union[Callable, str],
  68. cost_func,
  69. greater_is_better: bool,
  70. trials_path: str,
  71. backup_trials_freq: int = None,
  72. cross_validation_needs_scorer: bool = True,
  73. # cross_val_averaging_func: Callable = np.mean,
  74. cross_val_averaging_func = np.mean,
  75. # additional_metrics: Dict[str, Callable] = None,
  76. additional_metrics = {},
  77. # additional_averaging_funcs: Dict[str, Callable] = None,
  78. additional_averaging_funcs = None,
  79. strategy_name: str = None,
  80. # stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
  81. # = "INFO")
  82. stdout_log_level = "INFO"):
  83. """
  84. :param Callable cost_func: function to minimize or maximize
  85. over the elements of a given (pipeline/hyperparameter) space
  86. :param bool greater_is_better: when True
  87. cost_func is maximized, else minimized.
  88. :param str trials_path: path at which the trials object is saved
  89. in binary format. From the trials object we can
  90. select information about the obtained scores, score variations,
  91. and pipelines, and parameters tried out so far. If a trials object
  92. already exists at the given path, it is loaded and the
  93. search is continued, else, the search is started from scratch.
  94. :param backup_trials_freq: frequecy in interations (trials)
  95. of saving the trials object at the trials_path.
  96. if None, the trials object is backed up avery time
  97. the score improves.
  98. :param Callable cross_val_averaging_func: Function to aggregate
  99. the cross-validation scores of the cost_func.
  100. Example different from the mean: mean - c*var.
  101. :param additional_metics: dict of additional metrics to keep track of
  102. in the trials of the form {"metric_name": metric}.
  103. :param additional_averaging_funcs: functions used to aggregate
  104. the output of the cross_validate function.
  105. The output always contains the scores of the cost_func,
  106. additional_metrics (if it is not empty),
  107. but it can also contain additional information
  108. (like probability threshold for example)
  109. if different from cross_val_averaging_func.
  110. Of the form {"metric_name": averaging_func}
  111. Remark:
  112. :param str strategy_name:
  113. a strategy is defined by the data set (columns/features and rows),
  114. cv object, cost function.
  115. When the strategy changes, one must start with new trials.
  116. :param str stdout_log_level: can be INFO, WARNING, ERROR
  117. """
  118. self._logger = Log("PipelineSelector: ",
  119. stdout_log_level=stdout_log_level)
  120. try:
  121. ExceptionsHandler(self._logger)\
  122. .assert_is_directory(path=trials_path)
  123. self.attached_space = False
  124. self.attached_data = False
  125. self.configured_cross_validation = False
  126. self.configured_summary_saving = False
  127. self._cost_func = cost_func
  128. self._greater_is_better = greater_is_better
  129. # score factor is 1 when cost_func is minimized,
  130. # -1 when cost func is maximized
  131. self._score_factor = (not greater_is_better) - greater_is_better
  132. self._cross_val_averaging_func = cross_val_averaging_func
  133. self._additional_metrics = additional_metrics
  134. self._additional_averaging_funcs = additional_averaging_funcs or {}
  135. self.trials_path = trials_path
  136. self._backup_trials_freq = backup_trials_freq
  137. self._strategy_name = strategy_name
  138. self._data_path = None
  139. self._cv_path = None
  140. self._X = None
  141. self._y = None
  142. self._cv = None
  143. self._space = None
  144. # if cross-valition is not configured,
  145. # sklearn cross-validation method is taken by default
  146. self._cross_validation = sklearn_cross_validation
  147. self._cross_validation_needs_scorer = cross_validation_needs_scorer
  148. # if a trials object already exists at the given path,
  149. # it is loaded and the search is continued. Else,
  150. # the search is started from the beginning.
  151. if os.path.isfile(self.trials_path):
  152. with open(self.trials_path, "rb") as f:
  153. self._trials = pickle.load(f)
  154. if len(self._trials) == 0:
  155. self._trials = None
  156. else:
  157. self._trials = None
  158. if self._trials is not None:
  159. self._start_iteration = self.number_of_trials
  160. self.best_score = self.best_trial_score
  161. self._logger.info(("Loaded an existing trials object"
  162. "Consisting of {} trials")
  163. .format(self._start_iteration))
  164. else:
  165. self._logger.warning(("No existing trials object was found, "
  166. "Starting from scratch."))
  167. self._trials = None
  168. self._start_iteration = 0
  169. self.best_score = np.nan
  170. # keeping track of the current search iteration
  171. self._iteration = self._start_iteration
  172. self._score_improved = False
  173. self.start_tuning_time = datetime.datetime.today()
  174. self.total_tuning_time = None
  175. self.finished_tuning = False
  176. except Exception as e:
  177. err = ("Failed to initialize the class. "
  178. "Exit with error: {}".format(e))
  179. self._logger.log_and_raise_error(err)
  180. def _backup_trials(self) -> None:
  181. '''
  182. Pickles (Saves) the trials object in binary format.
  183. '''
  184. try:
  185. with open(self.trials_path, "wb") as f:
  186. pickle.dump(self._trials, f)
  187. except Exception as e:
  188. err = "Could not backup trials. Exit with error: {}".format(e)
  189. self._logger.log_and_raise_error(err)
  190. def configure_cross_validation(self,
  191. # cross_validation: Callable,
  192. cross_validation,
  193. kwargs: dict = None) -> None:
  194. """
  195. Method for attaching a custom cross-validation function
  196. :param cross_validation: a function that has the same
  197. signature as sklearn.model_selection.cross_validate
  198. """
  199. try:
  200. kwargs = kwargs or {}
  201. self._cross_validation = functools.partial(
  202. cross_validation, **kwargs)
  203. self.configured_cross_validation = True
  204. self._logger.info("Configured cross validation")
  205. except Exception as e:
  206. err = ("Failed to configure cross-validation. "
  207. "Exit with error: {}".format(e))
  208. self._logger.log_and_raise_error(err)
  209. def configure_cross_validation_from_module(self,
  210. module_path: str,
  211. name: str) -> None:
  212. """
  213. Attaches a cross-validation funciton defined in
  214. a different python model. This function must have
  215. the same signature as sklearn.model_seclection.cross_validate
  216. :param str module_path: path to python module
  217. where the cross_validation function is defined.
  218. :param str name: name of the cross validation function
  219. loaded froma python module.
  220. """
  221. try:
  222. self._cross_validation = \
  223. LoadingUtils().load_from_module(
  224. module_path=module_path, name=name)
  225. self.configured_cross_validation = True
  226. self._logger.info("Configured cross validation")
  227. except Exception as e:
  228. err = ("Failed to load cross-validation from module. "
  229. "Exit with error: {}".format(e))
  230. self._logger.log_and_raise_error(err)
  231. def attach_space(self, space) -> None:
  232. """
  233. Method for attaching the pipeline/hyperparameter space
  234. over which the score_func is optimized.
  235. :param space: space where
  236. the search is performed. A space might be either
  237. a list of dictionaries or a hyperopt space object
  238. the elements of which are dictionaries with keys:
  239. name, pipeline, params
  240. """
  241. try:
  242. self._space = space
  243. self.attached_space = True
  244. self._logger.info("Attached parameter distribution space")
  245. except Exception as e:
  246. err = ("Failed to attach space. "
  247. "Exit with error: {}".format(e))
  248. self._logger.log_and_raise_error(err)
  249. def attach_space_from_module(self, module_path: str, name: str) -> None:
  250. """
  251. Attaches a space defined in a different python module.
  252. :param str module_path: path to python module
  253. where the space is defined.
  254. :param str name: name of the space loaded from
  255. a python module.
  256. """
  257. try:
  258. self._space = LoadingUtils().load_from_module(
  259. module_path=module_path, name=name)
  260. self.attached_space = True
  261. self._logger.info("Attached parameter distribution space")
  262. except Exception as e:
  263. err = ("Failed to attach space from module. "
  264. "Exit with error {}".format(e))
  265. self._logger.loger_and_raise_error(err)
  266. def attach_data(self,
  267. # X_train: Union[pd.DataFrame, np.ndarray],
  268. # y_train: Union[pd.DataFrame, pd.Series, np.ndarray]
  269. # = None,
  270. # X_val: Union[pd.DataFrame, np.ndarray]
  271. # = None,
  272. # y_val: Union[pd.DataFrame, pd.Series, np.ndarray]
  273. # = None,
  274. # cv: Union[Iterable[Tuple[List[int], List[int]]]]
  275. # = None
  276. X_train,
  277. y_train = None,
  278. X_val = None,
  279. y_val = None,
  280. cv = None) -> None:
  281. '''
  282. :param array X_train: data on which
  283. machine learning pipelines are trained
  284. :param array y_train: optional, vector with targets,
  285. (None in case of unsupervided learning)
  286. :param array X_val: optional, validation data.
  287. When not provided, cross-validated value
  288. of the cost_func is calculated.
  289. :param array y_val: optional, validation targets
  290. :param list cv: iterabe of tuples containing
  291. train and validation indices or an integer representing
  292. the number of folds for a random split of data
  293. during cross-validation
  294. example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
  295. '''
  296. try:
  297. assert((cv is None) == (X_val is not None)),\
  298. "Either cv or X_val must be provided"
  299. if cv is None:
  300. assert((y_val is None) == (y_train is None)),\
  301. "y_train and y_val must be simultanious"
  302. # Here we create a trivial cv object
  303. # with one validation split.
  304. # XXX Tanya finish here
  305. cv = CVComposer.dummy_cv()
  306. train_inds = list(range(len(X_train)))
  307. val_inds = list(range(len(X_train),
  308. len(X_train) + len(X_val)))
  309. self._cv = [(train_inds, val_inds)]
  310. self._X = np.concatenate([X_train, X_val])
  311. self._y = None if y_train is None\
  312. else np.concatenate([y_train, y_val])
  313. else:
  314. self._cv = cv
  315. self._X = X_train
  316. self._y = y_train
  317. self.attached_data = True
  318. self._logger.info("Attached data")
  319. except Exception as e:
  320. err = ("Failed to attach data. "
  321. "Exit with error: {}".format(e))
  322. self._logger.log_and_raise_error(err)
  323. def attach_data_from_hdf5(self,
  324. data_hdf5_store_path: str,
  325. cv_pickle_path: str = None) -> None:
  326. """
  327. Method for attaching data from a hdf5 store
  328. and a cv object from a pickled file.
  329. The hdf5 store is a binary file,
  330. after loading it, it is a dictionary with keys
  331. X_train (y_train, X_val, y_val).
  332. The cv is loaded from a pickle file.
  333. The reason to separate the data
  334. store from the cv store, is the hdf5 is optimized to
  335. store large dataframes (especially with simple types) and
  336. a a small list of lists like a cv-object is better
  337. to be stored as a pickle file.
  338. :param str data_hdf5_store_path: path to the hdf5 store
  339. with train and validation data
  340. :param str cv_pickle_path: path to the pickle file with
  341. the cv data
  342. """
  343. try:
  344. assert(os.path.isfile(data_hdf5_store_path)),\
  345. "Parameter hdf5_store_path is not a file"
  346. # close all opened files, because hdf5 will
  347. # fail to reopen an opened (for some reason) file
  348. import tables
  349. tables.file._open_files.close_all()
  350. store = pd.HDFStore(data_hdf5_store_path)
  351. self._data_path = data_hdf5_store_path
  352. data_input = {key: store[key] if key in store else None
  353. for key in ["X_train", "y_train", "X_val", "y_val"]}
  354. if cv_pickle_path is not None:
  355. assert(os.path.isfile(cv_pickle_path)),\
  356. "Parameter cv_pickle_path is not a file"
  357. data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
  358. self._cv_path = cv_pickle_path
  359. else:
  360. data_input["cv"] = None
  361. self.attach_data(**data_input)
  362. store.close()
  363. except Exception as e:
  364. err = "Failed to attach data. Exit with error: {}".format(e)
  365. self._logger.log_and_raise_error(err)
  366. @property
  367. def default_summary(self) -> dict:
  368. """
  369. Default summary of the strategy.
  370. Every the _objective function is called
  371. the current score and the information
  372. about the tested space element is added to the
  373. summary and it is saved to the Trials.
  374. If summary saving is configured it is also
  375. saved to a file, or a database when the score improves.
  376. """
  377. summary = {}
  378. if self._strategy_name is not None:
  379. summary["strategy_name"] = self._strategy_name
  380. if isinstance(self._cost_func, str):
  381. summary["cost_func"] = self._cost_func
  382. elif hasattr(self._cost_func, "__name__"):
  383. summary["cost_func"] = self._cost_func.__name__
  384. summary["trials_path"] = self.trials_path
  385. if self._data_path is not None:
  386. summary["data_path"] = self._data_path
  387. if self._cv_path is not None:
  388. summary["cv_path"] = self._cv_path
  389. summary["start_tuning_time"] = self.start_tuning_time
  390. summary["iteration"] = self._iteration
  391. return summary
  392. def configer_summary_saving(self,
  393. # save_method: Callable
  394. save_method
  395. = functools.partial(
  396. pd.DataFrame.to_excel,
  397. **{"path_or_buf": "result.csv"}),
  398. kwargs: dict = None) -> None:
  399. """
  400. When the score calculated by _objective function improves,
  401. the default summary is updated with information about the
  402. current score and pipeline/hyperparameters
  403. and can be saved to a file or database, depending
  404. on the configured save_method.
  405. :param Callable save_method: method for saving the result
  406. of the pipeline selection. The method must accept
  407. a pandas DataFrame as argument.
  408. By default, saving to an excel file.
  409. Examples:
  410. functools.partial(pd.DataFrame.to_csv,
  411. **{"path_or_buf": <PATH>})
  412. functools.partial(np.savetxt, **{"fname": <PATH>})
  413. functools.partial(SQLHandler(<URI>).append_to_table,
  414. **{"tablename": <NAME>})
  415. functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
  416. **{"collection_name": <NAME>})
  417. using functools can be avoided by providing the kwarg argument
  418. :param dict kwargs: a dictionary with keyword arguments
  419. (like tablename) to provide to the save_method
  420. """
  421. try:
  422. kwargs = kwargs or {}
  423. self._save_method = functools.partial(save_method, **kwargs)
  424. self.configured_summary_saving = True
  425. self._logger.info("Configured summary saving")
  426. except Exception as e:
  427. err = ("Failed to configure the summary saving. "
  428. "Exit with error {}".format(e))
  429. self._logger.log_and_raise_error(err)
  430. def _save_summary(self, summary: dict) -> None:
  431. """
  432. When the score calculated by _objective function improves,
  433. the default summary is updated with information about the
  434. current score and pipeline/hyperparameters
  435. and can be saved to a file or database, depending
  436. on the configured save_method.
  437. """
  438. try:
  439. assert(self.configured_summary_saving),\
  440. "Result saving must be configured first"
  441. self._save_method(summary)
  442. except Exception as e:
  443. err = ("Could not configure summary saving. "
  444. "Exit with error: {}".format(e))
  445. self._logger.log_and_raise_error(err)
  446. def _evaluate(self, pipeline: Pipeline) :#-> Union[Dict[str, float], None]:
  447. """
  448. Calculates the averaged cross-validated score and score variance,
  449. as well as the averaged values and variances of the additional metrics.
  450. This method is called in the _objective function that is
  451. passed to the hyperopt optimizer.
  452. This function can be overriden, when the cost
  453. needs to be calculated differently,
  454. for example with a tensorflow model.
  455. :param Pipeline pipeline: machine learning pipeline
  456. that will be evaluated with cross-validation
  457. :return: dictionary with the aggregated
  458. cross-validation scores and
  459. the score variances for the scores in the output
  460. of the cross-validation function.
  461. form of the output:
  462. {"score": 10, #score used in optimization,
  463. "score_variance": 0.5
  464. "additional_metric1": 5,
  465. "additional_metric1_variance": 7}
  466. a custom cross-validation function can also include for
  467. example probability threshold for each fold, then
  468. the output of this function will include the average
  469. value and the variance of the probability threshold
  470. over the folds.
  471. """
  472. try:
  473. # scoring = {"score": self._cost_func} | self._additional_metrics
  474. scoring = {"score": self._cost_func, **self._additional_metrics}
  475. if self._cross_validation_needs_scorer:
  476. for metric_name, metric in scoring.items():
  477. scoring[metric_name] = make_scorer(
  478. metric, greater_is_better=self._greater_is_better)
  479. cross_validation_input_args = {
  480. "estimator": pipeline,
  481. "X": self._X,
  482. "y": self._y,
  483. "cv": self._cv,
  484. "scoring": scoring
  485. }
  486. if "error_score" in self._cross_validation.__annotations__:
  487. cross_validation_input_args["error_score"] = np.nan
  488. scores = self._cross_validation(**cross_validation_input_args)
  489. averaging_funcs = {
  490. metric_name: self._additional_averaging_funcs[metric_name]
  491. if metric_name in self._additional_averaging_funcs
  492. else self._cross_val_averaging_func
  493. for metric_name in scores}
  494. scores_average = {
  495. metric_name.replace("test_", ""):
  496. averaging_funcs[metric_name](scores[metric_name])
  497. for metric_name in scores
  498. if metric_name.startswith("test")}
  499. scores_variance = {
  500. metric_name.replace("test_", "") + "_variance":
  501. np.var(scores[metric_name])
  502. for metric_name in scores
  503. if metric_name.startswith("test")}
  504. return {**scores_average, **scores_variance}
  505. except Exception as e:
  506. err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
  507. self._logger.log_and_raise_error(err)
  508. def _objective(self, space_element: SpaceElementType) -> dict:
  509. '''
  510. This method is called in run_trials method
  511. that is using the hyperopt fmin opmizer.
  512. Uses _evaluate method.
  513. It must take as input a space element
  514. and produce an output in the form of dictionary
  515. with 2 obligatory values loss and status
  516. (STATUS_OK or STATUS_FAIL). Other
  517. values in the output are optional and can be
  518. accessed later through the trials object.
  519. :Warning: fmin minimizes the loss,
  520. when _evaluate returns a value to be maximized,
  521. it is multiplied by -1 to obtain loss.
  522. :param SpaceElementType space_element: element
  523. of the space over which the optimization is done
  524. :output: dictionary with keys
  525. loss (minimized value),
  526. status with values STATUS_OK or STATUS_FAIL
  527. uderstood by hyperopt,
  528. score (equal to loss or -loss),
  529. score_variance,
  530. timestamp (end of execution),
  531. train_time: execution time
  532. and other keys given in self.default_summary
  533. '''
  534. try:
  535. start_time = time.time()
  536. assert(self.attached_data),\
  537. ("Data must be attached in order "
  538. "in order to effectuate the best"
  539. "pipeline search")
  540. summary = deepcopy(self.default_summary)
  541. # backup the current trials if the score improved
  542. # at previous iteration or every ith iteration
  543. # if the backup_trials_freq is set
  544. backup_cond = ((self._backup_trials_freq is not None) and
  545. ((self._iteration - self._start_iteration - 1) %
  546. self._backup_trials_freq == 0)) or\
  547. self._score_improved
  548. if backup_cond:
  549. self._backup_trials()
  550. self._score_improved = False
  551. pipeline = space_element['pipeline']
  552. params = space_element['params']
  553. pipeline.set_params(**params)
  554. self._logger.info(("Iteration {0}: "
  555. "Current score is {1}: "
  556. "Training pipeline {2} "
  557. "with parameters: {3}. ").format(
  558. self._iteration,
  559. self.best_score,
  560. space_element['name'],
  561. params))
  562. result = self._evaluate(pipeline)
  563. summary.update(result)
  564. end_time = time.time()
  565. summary['status'] = STATUS_OK
  566. summary.update(result)
  567. summary['loss'] = self._score_factor * summary['score']
  568. summary['timestamp'] = datetime.datetime.today()
  569. summary['train_time'] = end_time - start_time
  570. self._iteration += 1
  571. self._score_improved = (self.best_score != self.best_score) or\
  572. (self._score_factor*result["score"] <
  573. self._score_factor*self.best_score)
  574. if self._score_improved:
  575. self._logger.info("Score improved, new best score is: {}"
  576. .format(result["score"]))
  577. self.best_score = result['score']
  578. if self.configured_summary_saving:
  579. self._save_summary(summary)
  580. except Exception as e:
  581. self._logger.warning("Trial failed with error {}".format(e))
  582. summary = {}
  583. summary['status'] = STATUS_FAIL
  584. summary['timestamp'] = datetime.datetime.today()
  585. summary['error'] = e
  586. for key in ['loss', 'score', 'score_variance', 'train_time']:
  587. summary[key] = np.nan
  588. return summary
  589. @abstractmethod
  590. def run_trials(self):
  591. """
  592. Method that runs the hyperparameter tuning over possibly multiple
  593. pipeline types specified in self.space
  594. When run_trials method is finished the flag self.finished_tuning
  595. should be set to True and the methods self._backup_trials and
  596. optionally self._save_result should be called.
  597. """
  598. pass
  599. @abstractproperty
  600. def number_of_trials(self) -> int:
  601. """
  602. Number of trials already run in the current trials object
  603. """
  604. pass
  605. @abstractproperty
  606. def best_trial(self) -> dict:
  607. """
  608. Best trial sor far.
  609. Should contain the status, pipeline,
  610. hyperparameters, and the score (loss).
  611. Other information is otional and is defined
  612. by self.default_summary
  613. """
  614. pass
  615. @abstractproperty
  616. def best_trial_score(self) -> float:
  617. """
  618. Score of the best pipeline with the best hyperparameters
  619. """
  620. pass
  621. @abstractproperty
  622. def best_trial_score_variance(self) -> float:
  623. """
  624. Variance of the cross-validation score of the best pipeline
  625. """
  626. pass
  627. @abstractproperty
  628. def best_trial_pipeline(self) -> Pipeline:
  629. """
  630. Best pipeline with best hyperparameters
  631. """
  632. pass
  633. @abstractmethod
  634. def get_n_best_trial_pipelines(self, n: int) -> list:
  635. """
  636. N best pipelines with corresponding
  637. best hyperparameters
  638. """
  639. pass
  640. @abstractmethod
  641. def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
  642. """
  643. If the hyperparameter search is done over multiple
  644. pipelines, then returns n different pipeline-types
  645. with corresponding hyperparameters
  646. """
  647. pass
  648. @abstractmethod
  649. def trials_to_excel(self, path: str) -> None:
  650. """
  651. Trials object in the shape of table written to excel,
  652. should contain the iteration, pipeline (as str),
  653. hyperparamters (as str), self.best_result (see self._objective method)
  654. as well as additional information defined by self.default_summary
  655. """
  656. pass