#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 14:23:23 2020

@author: tanya

@description: an abstract class for selecting a machine learning
pipeline from a space (deterministic or random) of parameter distributions
over multiple pipelines.
The selection is designed in such a way that a Trials object is
maintained during the tuning process, from which one can retrieve
the best pipeline so far as well as the entire tuning history if needed.
The methods configure_cross_validation and configer_summary_saving
allow to use a custom cross-validation method and to
save the current best result in a file or database during training.
Children classes: hyperopt and custom gridsearch.
"""

import pickle
import os
import sys
import time
import datetime
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod, abstractproperty
from typing import Callable
import functools
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate as sklearn_cross_validation
from sklearn.metrics import make_scorer
from hyperopt import STATUS_OK, STATUS_FAIL
from cdplib.log import Log
from cdplib.utils import ExceptionsHandler
from cdplib.utils import LoadingUtils

sys.path.append(os.getcwd())


class PipelineSelector(ABC):
    """
    An abstract class for selecting a machine learning
    pipeline from a space (deterministic or random) of parameter
    distributions over multiple pipelines.
    The selection is designed in such a way that a Trials object is
    maintained during the tuning process, from which one can retrieve
    the best pipeline so far as well as the entire tuning history
    if needed.
    The methods configure_cross_validation and configer_summary_saving
    allow to use a custom cross-validation method and to
    save the current best result in a file or database during training.
    Children classes: hyperopt and custom gridsearch.
    """
    def __init__(self,
                 cost_func: (Callable, str),
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: int = None,
                 cross_val_averaging_func: Callable = None,
                 additional_metrics: dict = None,
                 strategy_name: str = None,
                 stdout_log_level: str = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize
        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.
        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            pipelines, and parameters tried out so far. If a trials object
            already exists at the given path, it is loaded and the
            search is continued, else the search is started from
            the beginning.
        :param int backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            If None, the trials object is backed up every time
            the score improves.
        :param Callable cross_val_averaging_func: optional,
            when not provided set to the mean. Function
            to aggregate the cross-validated values of the cost function.
            The classic choice is the mean; another example
            is mean() - c*var().
        :param dict additional_metrics: dict of additional metrics to save
            of the form {"metric_name": metric} where metric is a Callable.
        :param str strategy_name: a name that might be assigned to the trials.
            A strategy is defined by the data set, the cv object, and the
            cost function. When the strategy changes, one should start
            with new trials.
        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        self._logger = Log("PipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        input_errors = [(cost_func, Callable,
                         "Parameter 'cost_func' must be a Callable"),
                        (greater_is_better, bool,
                         "Parameter 'greater_is_better' must be bool type"),
                        (trials_path, str,
                         "Parameter 'trials_path' must be of string type"),
                        (cross_val_averaging_func, (Callable, None.__class__),
                         ("Parameter 'cross_val_averaging_func' "
                          "must be a Callable")),
                        (backup_trials_freq, (int, None.__class__),
                         "Parameter backup_trials_freq must be an int"),
                        (additional_metrics, (dict, None.__class__),
                         "Parameter additional_metrics must be a dict"),
                        (strategy_name, (str, None.__class__),
                         "Parameter strategy_name must be a str"),
                        (stdout_log_level, str,
                         "Parameter stdout_log_level must be a str")]

        for p, t, err in input_errors:
            try:
                assert(isinstance(p, t))
            except AssertionError:
                self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            assert((additional_metrics is None) or
                   all([isinstance(metric, Callable)
                        for metric in additional_metrics.values()]))
        except AssertionError:
            err = "Metrics in additional_metrics must be Callables"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        ExceptionsHandler(self._logger).assert_is_directory(path=trials_path)

        self._cost_func = cost_func
        # score factor is 1 when cost_func is minimized,
        # -1 when cost_func is maximized
        self._score_factor = (not greater_is_better) - greater_is_better
        self.trials_path = trials_path
        self._backup_trials_freq = backup_trials_freq
        self._cross_val_averaging_func = cross_val_averaging_func or np.mean
        self._additional_metrics = additional_metrics or {}
        self._strategy_name = strategy_name
        self._data_path = None
        self._cv_path = None

        # best_score can also be read from the trials object,
        # but is kept explicitly in order not to
        # search through the trials object every time.
        # loss is the opposite of score
        self.best_score = np.nan

        self._cross_validation = sklearn_cross_validation

        # if a trials object already exists at the given path,
        # it is loaded and the search is continued. Else,
        # the search is started from the beginning.
        if os.path.isfile(self.trials_path):
            try:
                with open(self.trials_path, "rb") as f:
                    self._trials = pickle.load(f)

                self._start_iteration = self.number_of_trials
                self.best_score = self.best_trial_score

                self._logger.info(("Loaded an existing trials object "
                                   "consisting of {} trials")
                                  .format(self._start_iteration))
            except Exception as e:
                err = ("Trials object could not be loaded. "
                       "Exit with error {}").format(e)
                self._logger.log_and_raise_error(err)
                self._trials = None
        else:
            self._logger.warning(("No existing trials object was found, "
                                  "starting from scratch."))
            self._trials = None
            self._start_iteration = 0

        self.attached_space = False
        self.attached_data = False
        self.configured_cross_validation = False
        self.configured_summary_saving = False

        # keeping track of the current search iteration
        self._iteration = self._start_iteration
        self._score_improved = False

        self.start_tuning_time = datetime.datetime.today()
        self.end_tuning_time = None
        self.finished_tuning = False

    def _backup_trials(self):
        '''
        Pickles (saves) the trials object.
        Used in a scheduler.
        '''
        try:
            with open(self.trials_path, "wb") as f:
                pickle.dump(self._trials, f)
        except Exception as e:
            err = "Could not backup trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    def configure_cross_validation(self,
                                   cross_validation: Callable,
                                   kwargs: dict = None):
        """
        Method for attaching a custom cross-validation function
        :param cross_validation: a function that has the same
            signature as sklearn.model_selection.cross_validate
        :param dict kwargs: keyword arguments to fix for the
            cross-validation function
        """
        try:
            assert(isinstance(cross_validation, Callable))
        except AssertionError:
            err = "Parameter cross_validation must be a function"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            kwargs = kwargs or {}
            assert(isinstance(kwargs, dict))
        except AssertionError:
            err = "Parameter kwargs must be a dict"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            # attach the custom cross-validation function
            # with the given keyword arguments fixed
            self._cross_validation = functools.partial(
                cross_validation, **kwargs)

            self.configured_cross_validation = True

            if hasattr(cross_validation, "__name__"):
                self.best_result["cross_validation"] =\
                    cross_validation.__name__

            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to configure cross-validation. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

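    # An illustrative sketch (not part of the original module): kwargs lets
    # one fix keyword arguments of the attached cross-validation function,
    # here shown with sklearn's cross_validate; the instance name "selector"
    # is hypothetical.
    #
    #   selector.configure_cross_validation(
    #       cross_validation=sklearn_cross_validation,
    #       kwargs={"n_jobs": -1, "return_train_score": False})
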
    def configure_cross_validation_from_module(self,
                                               module_path: str,
                                               name: str):
        """
        Method for attaching a custom cross-validation function
        loaded from a python module.
        :param str module_path: path to the python module
            where the cross-validation function is defined.
        :param str name: name of the cross-validation function.
        """
        try:
            assert(isinstance(module_path, str) and
                   isinstance(name, str))
        except AssertionError:
            err = "Parameters module_path and name must be of str type"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            self._cross_validation = \
                LoadingUtils().load_from_module(
                    module_path=module_path, name=name)

            self.configured_cross_validation = True
            self.best_result["cross_validation"] = name

            self._logger.info("Configured cross validation")
        except Exception as e:
            err = ("Failed to load cross-validation from module. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_space(self, space):
        """
        :param space: space where
            the search is performed. A space might be either
            a list of dictionaries or a hyperopt space object
            the elements of which are dictionaries with keys:
            name, pipeline, params
        """
        self._space = space
        self._logger.info("Attached parameter distribution space")
        self.attached_space = True

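    # An illustrative sketch (not part of the original module): a
    # deterministic space can be a list of dictionaries of the form below;
    # the pipeline steps and parameter values are hypothetical examples.
    #
    #   from sklearn.linear_model import LogisticRegression
    #   from sklearn.preprocessing import StandardScaler
    #
    #   example_space = [
    #       {"name": "logreg_default",
    #        "pipeline": Pipeline([("scaler", StandardScaler()),
    #                              ("model", LogisticRegression())]),
    #        "params": {"model__C": 1.0}},
    #       {"name": "logreg_strong_regularization",
    #        "pipeline": Pipeline([("scaler", StandardScaler()),
    #                              ("model", LogisticRegression())]),
    #        "params": {"model__C": 0.1}}]
    #
    #   selector.attach_space(example_space)
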
    def attach_space_from_module(self, module_path: str, name: str):
        """
        :param str module_path: path to the python module
            where the space is defined.
        :param str name: name of the space loaded from
            a python module.
        """
        try:
            assert(isinstance(module_path, str) and
                   isinstance(name, str))
        except AssertionError:
            err = "Parameters module_path and name must be of str type"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        try:
            self._space = LoadingUtils().load_from_module(
                module_path=module_path, name=name)

            self._logger.info("Attached parameter distribution space")
            self.attached_space = True
        except Exception as e:
            err = ("Failed to attach space from module. "
                   "Exit with error {}".format(e))
            self._logger.log_and_raise_error(err)

    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
                    X_val: (pd.DataFrame, np.ndarray) = None,
                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
                    cv: (list, int) = None):
        '''
        :param array X_train: data on which
            machine learning pipelines are trained
        :param array y_train: optional, vector with targets
            (not all algorithms require targets)
        :param array X_val: optional, validation data.
            When not provided, the cross-validated value
            of the cost_func is calculated.
        :param array y_val: optional, validation targets
        :param cv: list of tuples containing
            train and validation indices, or an integer representing
            the number of folds for a random split of the data
            during cross-validation.
            Example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
        '''
        NoneType = None.__class__
        input_err = "Non-valid combination of train and val data types"

        if cv is None:
            try:
                assert(isinstance(X_train, (pd.DataFrame, np.ndarray)) and
                       isinstance(X_val, (pd.DataFrame, np.ndarray)) and
                       isinstance(y_train, (pd.Series, np.ndarray,
                                            pd.DataFrame, NoneType)) and
                       isinstance(y_val, (pd.Series, np.ndarray,
                                          pd.DataFrame, NoneType)) and
                       (y_val is None) == (y_train is None))
            except AssertionError:
                self._logger.log_and_raise_error(input_err)

            try:
                # cost is evaluated with a cross-validation function
                # that accepts an array and a cv object with
                # indices of the fold splits.
                # Here we create a trivial cv object
                # with one validation split.
                train_inds = list(range(len(X_train)))
                val_inds = list(range(len(X_train),
                                      len(X_train) + len(X_val)))

                self._cv = [(train_inds, val_inds)]
                self._X = np.concatenate([X_train, X_val])
                self._y = None if y_train is None\
                    else np.concatenate([y_train, y_val])
            except Exception as e:
                err = "Failed to attach data. Exit with error: {}".format(e)
                self._logger.log_and_raise_error(err)
        else:
            try:
                assert(isinstance(X_train, (pd.DataFrame, np.ndarray)) and
                       isinstance(y_train, (pd.Series, np.ndarray,
                                            pd.DataFrame, NoneType)) and
                       (X_val is None) and (y_val is None))
            except AssertionError:
                self._logger.log_and_raise_error(input_err)

            self._cv = cv
            self._X = X_train
            self._y = y_train

        self._logger.info("Attached data")
        self.attached_data = True

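    # An illustrative sketch (not part of the original module) of the two
    # supported ways of attaching data; the variable names are hypothetical.
    #
    #   # a) one fixed train/validation split
    #   selector.attach_data(X_train=X_tr, y_train=y_tr,
    #                        X_val=X_va, y_val=y_va)
    #
    #   # b) cross-validation, either with a number of folds
    #   #    or with explicit fold indices
    #   selector.attach_data(X_train=X, y_train=y, cv=5)
    #   selector.attach_data(X_train=X, y_train=y,
    #                        cv=[([0, 1, 2], [3, 4]), ([1, 2, 3], [4, 5])])
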
    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None):
        """
        Method for attaching data from an hdf5 store
        and a cv object from a pickle file.
        The hdf5 store is a binary file which, after loading,
        behaves like a dictionary with keys
        X_train (y_train, X_val, y_val). The cv is loaded
        from a pickle file. The reason to separate the data
        store from the cv store is that hdf5 is optimized for
        storing large dataframes (especially with simple types), while
        a small list of lists like a cv object is better
        stored as a pickle file.
        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data
        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path))
        except AssertionError:
            err = "Parameter data_hdf5_store_path is not a file"
            self._logger.log_and_raise_error(err, ErrorType=NameError)

        # load the hdf5 store
        try:
            store = pd.HDFStore(data_hdf5_store_path)
            self._data_path = data_hdf5_store_path
        except Exception as e:
            err = "Could not load the hdf5 store. Exit with error: {}."\
                .format(e)
            self._logger.log_and_raise_error(err)

        data_input = {}

        for key in ["/X_train", "/y_train", "/X_val", "/y_val"]:
            if key not in store.keys():
                data_input[key.replace("/", "")] = None
            else:
                data_input[key.replace("/", "")] = store[key]

        if cv_pickle_path is not None:
            try:
                assert(os.path.isfile(cv_pickle_path))
            except AssertionError:
                err = "Parameter cv_pickle_path is not a file"
                self._logger.log_and_raise_error(err, ErrorType=NameError)

            try:
                with open(cv_pickle_path, "rb") as f:
                    data_input["cv"] = pickle.load(f)
                self._cv_path = cv_pickle_path
            except Exception as e:
                err = "Could not load the pickled cv. Exit with error: {}."\
                    .format(e)
                self._logger.log_and_raise_error(err)
        else:
            data_input["cv"] = None

        self.attach_data(**data_input)

        store.close()

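    # An illustrative sketch (not part of the original module): creating
    # inputs in the format expected by attach_data_from_hdf5; the paths and
    # data variables are hypothetical.
    #
    #   with pd.HDFStore("data_store.h5") as store:
    #       store["X_train"] = X_train_df        # pandas DataFrame
    #       store["y_train"] = y_train_series    # pandas Series
    #
    #   with open("cv.pkl", "wb") as f:
    #       pickle.dump([([0, 1, 2], [3, 4]), ([1, 2, 3], [4, 5])], f)
    #
    #   selector.attach_data_from_hdf5(data_hdf5_store_path="data_store.h5",
    #                                  cv_pickle_path="cv.pkl")
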
    def configer_summary_saving(self,
                                save_method: Callable = None,
                                kwargs: dict = None):
        """
        Attaches a method for saving information about
        the trials/space/strategy and the result of
        the current best pipeline. This method can
        save the result in a txt or a json file,
        or in a database, for example. Arguments like
        the file path or the table name can be specified in kwargs.
        :param Callable save_method: method for saving the result
            of the pipeline selection. The method must accept
            a pandas DataFrame as argument. See the _save_summary
            method for the format of the argument being saved.
            By default, saving to a csv file.
            Examples:
                functools.partial(pd.DataFrame.to_csv,
                                  **{"path_or_buf": <PATH>})
                functools.partial(np.savetxt, **{"fname": <PATH>})
                functools.partial(SQLHandler(<URI>).append_to_table,
                                  **{"tablename": <NAME>})
                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
                                  **{"collection_name": <NAME>})
            Using functools can be avoided by providing the kwargs argument.
        :param dict kwargs: a dictionary with keyword arguments
            (like tablename) to provide to the save_method
        """
        try:
            save_method = save_method or functools.partial(
                pd.DataFrame.to_csv, **{"path_or_buf": "result.csv"})

            kwargs = kwargs or {}

            self._save_method = functools.partial(save_method, **kwargs)

            self.configured_summary_saving = True

            self._logger.info("Configured summary saving")
        except Exception as e:
            err = ("Failed to configure the summary saving. "
                   "Exit with error {}".format(e))
            self._logger.log_and_raise_error(err)

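    # An illustrative sketch (not part of the original module): two
    # equivalent ways of configuring summary saving described in the
    # docstring above; the output path is hypothetical.
    #
    #   selector.configer_summary_saving(
    #       save_method=functools.partial(pd.DataFrame.to_csv,
    #                                     path_or_buf="summary.csv"))
    #
    #   selector.configer_summary_saving(
    #       save_method=pd.DataFrame.to_csv,
    #       kwargs={"path_or_buf": "summary.csv"})
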
    def _save_summary(self, summary: dict):
        """
        Saves the summary of the current trial
        using the configured save method.
        """
        try:
            assert(self.configured_summary_saving)
        except AssertionError:
            err = "Result saving must be configured first"
            self._logger.log_and_raise_error(err, ErrorType=AssertionError)

        try:
            self._save_method(summary)
        except Exception as e:
            err = ("Could not save the summary. "
                   "Exit with error: {}".format(e))
            self._logger.log_and_raise_error(err)

    def _evaluate(self, pipeline: Pipeline,
                  scoring: Callable = None,
                  cross_validation: Callable = None) -> dict:
        """
        This method is called in _objective.
        Calculates the cost on the attached data.
        This function can be overridden when the cost
        needs to be calculated differently,
        for example with a tensorflow model.
        :param Pipeline pipeline: machine learning pipeline
            that will be evaluated with cross-validation
        :param cross_validation: a function that has the same
            signature as sklearn.model_selection.cross_validate
        :return: dictionary with the aggregated
            cross-validation scores and
            the score variances.
        """
        try:
            scoring = {"score": make_scorer(self._cost_func)}

            scoring.update({metric_name: make_scorer(metric)
                            for metric_name, metric
                            in self._additional_metrics.items()})

            scores = self._cross_validation(
                estimator=pipeline,
                X=self._X,
                y=self._y,
                cv=self._cv or 5,
                scoring=scoring,
                error_score=np.nan)

            scores_average = {
                metric_name.replace("test_", ""):
                self._cross_val_averaging_func(scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            scores_variance = {
                metric_name.replace("test_", "") + "_variance":
                np.var(scores[metric_name])
                for metric_name in scores
                if metric_name.startswith("test")}

            return {**scores_average, **scores_variance}
        except Exception as e:
            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

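    # An illustrative sketch (not part of the original module): with
    # cost_func=accuracy_score and additional_metrics={"f1": f1_score},
    # sklearn's cross_validate reports per-fold values under the keys
    # "test_score" and "test_f1", so _evaluate returns a dict of the
    # (hypothetical) form
    #
    #   {"score": 0.93, "f1": 0.91,
    #    "score_variance": 0.0004, "f1_variance": 0.0007}
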
    def _objective(self, space_element: dict) -> dict:
        '''
        This method is called in search_for_best_pipeline
        inside the hyperopt fmin method.
        Uses the _evaluate method.
        It must take a space element as input
        and produce an output in the form of a dictionary
        with 2 obligatory values, loss and status
        (STATUS_OK or STATUS_FAIL). Other
        values in the output are optional and can be
        accessed later through the trials object.
        :Warning: fmin minimizes the loss;
        when _evaluate returns a value to be maximized,
        it should be multiplied by -1 to obtain the loss.
        :param dict space_element: must contain keys
            name (with the name of the pipeline),
            pipeline (Pipeline object),
            params (dict of pipeline params)
        :output: dictionary with keys
            loss (minimized value),
            status with values STATUS_OK or STATUS_FAIL
            understood by hyperopt,
            score (equal to loss or -loss),
            score_variance,
            timestamp (end of execution),
            train_time: execution time
        '''
        try:
            assert(isinstance(space_element, dict) and
                   set(['name', 'pipeline', 'params']) <=
                   space_element.keys())

            assert(isinstance(space_element['name'], str) and
                   isinstance(space_element['pipeline'], Pipeline) and
                   isinstance(space_element['params'], dict))
        except AssertionError:
            err = "Space elements are of wrong form"
            self._logger.log_and_raise_error(err)

        start_time = time.time()

        try:
            assert(self.attached_data)
        except AssertionError:
            err = ("Data must be attached in order "
                   "to search for the best pipeline")
            self._logger.log_and_raise_error(err)

        summary = {}

        if self._strategy_name is not None:
            summary["strategy_name"] = self._strategy_name

        if isinstance(self._cost_func, str):
            summary["cost_func"] = self._cost_func
        elif hasattr(self._cost_func, "__name__"):
            summary["cost_func"] = self._cost_func.__name__

        summary["trials_path"] = self.trials_path

        if self._data_path is not None:
            summary["data_path"] = self._data_path

        if self._cv_path is not None:
            summary["cv_path"] = self._cv_path

        summary["start_tuning_time"] = self.start_tuning_time

        summary["iteration"] = self._iteration

        backup_cond = (self._backup_trials_freq is not None) and\
            ((self._iteration - self._start_iteration - 1) %
             self._backup_trials_freq == 0) or\
            self._score_improved

        if backup_cond:
            self._backup_trials()
            self._score_improved = False

        try:
            pipeline = space_element['pipeline']
            params = space_element['params']
            pipeline.set_params(**params)

            self._logger.info(("Iteration {0}: "
                               "Current score is {1}: "
                               "Training pipeline {2} "
                               "with parameters: {3}. ").format(
                                   self._iteration,
                                   self.best_score,
                                   space_element['name'],
                                   params))

            result = self._evaluate(pipeline)

            summary.update(result)

            end_time = time.time()

            assert(not np.isnan(result["score"])),\
                "Score value is not in the output of the _evaluate method"

            summary['status'] = STATUS_OK
            summary['loss'] = self._score_factor * summary['score']
            summary['timestamp'] = datetime.datetime.today()
            summary['train_time'] = end_time - start_time

            self._iteration += 1

            # the score improved if there is no best score yet
            # (best_score != best_score checks for NaN)
            # or if the new loss is smaller than the best loss so far
            self._score_improved = (self.best_score != self.best_score) or\
                (self._score_factor*result["score"] <
                 self._score_factor*self.best_score)

            if self._score_improved:
                self._logger.info("Score improved, new best score is: {}"
                                  .format(result["score"]))
                self.best_score = result['score']

                if self.configured_summary_saving:
                    self._save_summary(summary)
        except Exception as e:
            self._logger.warning("Trial failed with error {}".format(e))

            summary['status'] = STATUS_FAIL
            summary['timestamp'] = datetime.datetime.today()
            summary['error'] = e

            for key in ['loss', 'score', 'score_variance', 'train_time']:
                summary[key] = np.nan

        return summary

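    # An illustrative sketch (not part of the original module): in a
    # hyperopt-based child class, _objective is the function handed to fmin,
    # for example
    #
    #   from hyperopt import fmin, tpe, Trials
    #
    #   self._trials = self._trials or Trials()
    #   fmin(fn=self._objective,
    #        space=self._space,
    #        algo=tpe.suggest,
    #        trials=self._trials,
    #        max_evals=100)
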
    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self._space.
        When the run_trials method is finished, the flag self.finished_tuning
        should be set to True and the method self._backup_trials
        (and optionally self._save_summary) should be called.
        """
        pass

    @abstractproperty
    def number_of_trials(self) -> int:
        """
        Number of trials already run in the current trials object
        """
        pass

    @abstractproperty
    def best_trial(self) -> dict:
        """
        Best trial so far.
        Should contain the best pipeline,
        the best hyperparameters,
        as well as the output of the self._objective method,
        but the exact form of the output depends on the implementation
        of the Trials object.
        """
        pass

    @abstractproperty
    def best_trial_score(self) -> float:
        """
        Score of the best pipeline with the best hyperparameters
        """
        pass

    @abstractproperty
    def best_trial_score_variance(self) -> float:
        """
        Variance of the cross-validated score of the best pipeline
        """
        pass

    @abstractproperty
    def best_trial_pipeline(self) -> Pipeline:
        """
        Best pipeline with the best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with the corresponding
        best hyperparameters
        """
        pass

    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipeline types, returns n pipelines of each type
        with the corresponding best hyperparameters
        """
        pass

    @abstractmethod
    def trials_to_excel(self, path: str):
        """
        Writes the trials object, in the shape of a table, to an excel file.
        The table should contain the iteration, the pipeline (as str),
        the hyperparameters (as str), the summary returned by
        self._objective, as well as additional information saved
        through the configer_summary_saving method.
        """
        pass
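

# An illustrative end-to-end sketch (not part of the original module); the
# child class name, paths, and data variables below are hypothetical
# placeholders.
#
#   from sklearn.metrics import accuracy_score
#
#   selector = SomeHyperoptPipelineSelector(      # a concrete child class
#       cost_func=accuracy_score,
#       greater_is_better=True,
#       trials_path="trials/my_strategy.pkl",
#       backup_trials_freq=10,
#       strategy_name="my_strategy")
#
#   selector.attach_space(example_space)          # see attach_space above
#   selector.attach_data(X_train=X, y_train=y, cv=5)
#   selector.configer_summary_saving(save_method=pd.DataFrame.to_csv,
#                                    kwargs={"path_or_buf": "summary.csv"})
#
#   selector.run_trials()
#
#   best_pipeline = selector.best_trial_pipeline
#   best_score = selector.best_trial_score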