HyperoptPipelineSelection.py 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Nov 9 13:27:44 2018
  5. @author: tanja
  6. @description: Implementation of machine learning
  7. pipeline selection and tuning with hyperopt library
  8. """
  9. import os
  10. import sys
  11. import gc
  12. import logging
  13. import pickle
  14. import time
  15. import datetime
  16. import pandas as pd
  17. import numpy as np
  18. from sklearn.pipeline import Pipeline
  19. from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
  20. space_eval, pyll
  21. from sklearn.model_selection import cross_validate
  22. class HyperoptPipelineSelection:
  23. '''
  24. Use this class to perform a search
  25. for a machine learning pipeline in a given parameter space.
  26. The parameter space can include multiple types of Pipelines
  27. (SVM, XGBOOST, random forest, etc),
  28. as well as parameter distributions for each pipeline parameter.
  29. See example in main for the expected space structure.
  30. The search can be performed either randomly
  31. or with a tree-based algorithm. (Other methods are currently
  32. being developed by the hyperopt creators).
  33. Attribute trials is responsible for book-keeping parameter
  34. combinations that have already been tried out. This attribute
  35. is saved to a binary file every n minutes as well as every time
  36. a better pipeline was found.
  37. '''
  def __init__(self,
               cost_func,
               greater_is_better: bool,
               trials_path: str,
               backup_trials_freq: int = 1,
               log_path: str = None,
               averaging_func: callable = None):
      '''
      Validates the arguments, configures logging and restores a
      previously saved trials object when one exists.

      :param callable cost_func: function to minimize or maximize
      :param bool greater_is_better: when True
          cost_func is maximized, else minimized.
      :param str trials_path: path at which the trials object is saved
          in binary format. From the trials object we can
          select information about the obtained scores, score variations,
          and pipelines, and parameters tried out so far. If a trials object
          already exists at the given path, it is loaded and the
          search is continued, else, the search is started from
          the beginning.
      :param int backup_trials_freq: frequency in iterations (trials)
          of saving the trials object at the trials_path.
          Must be a positive integer.
      :param str log_path: Optional, when not provided logs to stdout.
      :param callable averaging_func: optional,
          when not provided set to mean. Function
          to aggregate the cross-validated values of the cost function.
          Classic situation is to take the mean,
          another example is mean() - c*var().
      :raises AssertionError: when an argument has an invalid type or value.
      '''
      assert callable(cost_func), \
          "Parameter 'cost_func' must be a callable"

      assert isinstance(greater_is_better, bool), \
          "Parameter 'greater_is_better' must be bool type"

      assert isinstance(trials_path, str), \
          "Parameter 'trials_path' must be of string type"

      # The frequency is used as a modulus in _objective; zero would raise
      # ZeroDivisionError there and silently mark every trial as failed.
      assert (isinstance(backup_trials_freq, (int, np.integer))
              and backup_trials_freq > 0), \
          "Parameter 'backup_trials_freq' must be a positive integer"

      if averaging_func is not None:
          assert callable(averaging_func), \
              "Parameter 'averaging_func' must be a callable"

      self._assert_valid_directory(path=trials_path)

      self._configer_logger(log_path)

      self._cost_func = cost_func
      # is 1 when cost_func is minimized, -1 when cost func is maximized
      self._score_factor = (not greater_is_better) - greater_is_better
      self._trials_path = trials_path
      # is initialized with empty trials object
      self._trials = Trials()
      self._backup_trials_freq = backup_trials_freq
      self._averaging_func = averaging_func or np.mean
      # keeping track of the current search iteration
      self._run_number = 0
      # space and data need to be attached to perform search.
      self._space_attached = False
      self._data_attached = False

      # if a trials object already exists at the given path,
      # it is loaded and the search is continued. Else,
      # the search is started from the beginning.
      if os.path.isfile(trials_path):
          try:
              # NOTE(review): pickle.load executes arbitrary code from the
              # file; trials_path must only point to trusted files.
              with open(trials_path, "rb") as f:
                  self._trials = pickle.load(f)

              self._logger.info(("Loaded an existing trials object "
                                 "consisting of {} trials")
                                .format(len(self._trials.trials)))

          except Exception as e:
              # Best effort: an unreadable backup must not prevent a new
              # search, the empty Trials object created above is kept.
              self._logger.error(("Trials object could not be loaded. "
                                  "Training starts from the beginning. "
                                  "Exit with error {}").format(e))

      else:
          self._logger.info(("No existing trials object was found. "
                             "Initialized an empty trials object."))

      self._best_score = self.best_trial_score
  107. def _configer_logger(self, log_path: str = None):
  108. '''
  109. Can be replaced with the existing script later.
  110. When log_path is not provided, logs to stdout.
  111. '''
  112. self._logger = logging.getLogger(__name__)
  113. if (self._logger.hasHandlers()):
  114. self._logger.handlers.clear()
  115. if log_path is not None:
  116. assert(isinstance(log_path, str)),\
  117. "Parameter 'log_path' must be of string type"
  118. self._assert_valid_directory(log_path)
  119. handler = logging.FileHandler(log_path)
  120. else:
  121. handler = logging.StreamHandler(sys.stdout)
  122. formatter = logging.Formatter(
  123. '\n %(asctime)s %(levelname)s %(message)s')
  124. handler.setFormatter(formatter)
  125. self._logger.addHandler(handler)
  126. self._logger.setLevel("INFO")
  127. def _backup_trials(self):
  128. '''
  129. Pickles (Saves) the trials object.
  130. Used in a scheduler.
  131. '''
  132. with open(self._trials_path, "wb") as f:
  133. pickle.dump(self._trials, f)
  134. def _assert_valid_directory(self, path: str):
  135. '''
  136. If the directory of a path does not exist yet,
  137. creates it.
  138. '''
  139. assert(isinstance(path, str)),\
  140. "Parameter 'path' must of str type"
  141. dirname = os.path.dirname("path")
  142. if len(dirname) > 0:
  143. os.mkdir(dirname, exists_ok=True)
  def attach_space(self, space: pyll.base.Apply = None,
                   module_path: str = None,
                   name: str = None):
      '''
      Attaches the hyperopt search space, either given directly
      or loaded by name from a python file.

      :param pyll.base.Apply space: hyperopt space where
          the search is performed. Optional when a space
          is loaded from a python module.
      :param str module_path: path to python module
          where the space is defined. Optional when
          the space is provided directly.
      :param str name: name of the space loaded from
          a python module. Optional when the space
          is provided directly.
      :raises AssertionError: when the arguments are missing or invalid.
      :raises Exception: when the module cannot be imported or
          does not define the requested name.
      '''
      assert (space is not None) or \
          ((module_path is not None) and (name is not None)), \
          "Either space or (module_path, name) must be provided"

      if space is None:
          # Imported here because it is only needed on this code path.
          import importlib.util

          for label, value in (("module_path", module_path),
                               ("name", name)):
              assert isinstance(value, str), \
                  "Parameter '{}' must be of str type".format(label)

          assert os.path.isfile(module_path), \
              "Parameter 'module_path' must be a valid file"

          module, extension = os.path.splitext(os.path.basename(module_path))
          assert extension == ".py", \
              "Parameter 'space' must be read from a python file"

          try:
              # Load the file directly from its location; sys.path is
              # left untouched.
              spec = importlib.util.spec_from_file_location(module,
                                                            module_path)
              loaded_module = importlib.util.module_from_spec(spec)
              spec.loader.exec_module(loaded_module)
              space = getattr(loaded_module, name)
          except (ImportError, AttributeError) as e:
              err = "Invalid space location or name"
              self._logger.error(err)
              raise Exception(err) from e

      assert isinstance(space, pyll.base.Apply), \
          "Parameter 'space' must be of hyperopt space type"

      self._space = space
      self._logger.info("Attached parameter distribution space")
      self._space_attached = True
  182. def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
  183. -> np.ndarray:
  184. '''
  185. Converts an DataFrame to an numpy array.
  186. '''
  187. if isinstance(x, np.ndarray):
  188. return x
  189. elif (isinstance(x, pd.core.frame.DataFrame))\
  190. or (isinstance(x, pd.core.series.Series)):
  191. return x.values
  192. else:
  193. e = 'The argument must be a numpy array or a pandas DataFrame'
  194. self._logger.critical(e)
  195. raise ValueError(e)
  196. def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
  197. y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
  198. X_val: (pd.DataFrame, np.ndarray) = None,
  199. y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
  200. cv: (list, int) = None):
  201. '''
  202. :param array X_train: data on which
  203. machine learning pipelines are trained
  204. :param array y_train: optional, vector with targets,
  205. (not all algorithms require a targets)
  206. :param array X_val: optional, validation data.
  207. When not provided, cross-validated value
  208. of the cost_func is calculated.
  209. :param array y_val: optional, validation targets
  210. :param list cv: list of tuples containing
  211. train and validation indices or an integer representing
  212. the number of folds for a random split of data
  213. during cross-validation
  214. example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
  215. '''
  216. X_train = self._convert_to_array(X_train)
  217. if y_train is not None:
  218. y_train = self._convert_to_array(y_train)
  219. if X_val is not None:
  220. if cv is not None:
  221. self._logger.warning(("Both validation set and cv object "
  222. "are set. Validation score will be "
  223. "calculated on the validation set!"))
  224. X_val = self._convert_to_array(X_val)
  225. train_inds = list(range(len(X_train)))
  226. val_inds = list(range(len(X_train),
  227. len(X_train) + len(X_val)))
  228. # cost is evaluated with a cross validation function
  229. # that accepts an array and a cv object with
  230. # indices of the fold splits.
  231. # Here we create a trivial cv object
  232. # with one validation split.
  233. self._cv = [(train_inds, val_inds)]
  234. self._X = np.concatenate([X_train, X_val])
  235. if y_train is not None:
  236. if y_val is None:
  237. err = "Argument y_val must be provided"
  238. self._logger.critical(err)
  239. raise ValueError(err)
  240. else:
  241. y_val = self._convert_to_array(y_val)
  242. self._y = np.concatenate([y_train, y_val])
  243. else:
  244. self._y = None
  245. else:
  246. if cv is None:
  247. self._logger.warning(("Neither validation set nor cv object "
  248. "are set. Validation score will be "
  249. "calculated on 5 randomly "
  250. "splitted folds."))
  251. self._X = X_train
  252. self._y = y_train
  253. self._cv = cv
  254. self._logger.info("Attached data")
  255. self._data_attached = True
  def _evaluate(self, pipeline: Pipeline) -> dict:
      '''
      This method is called in _objective.

      Calculates the cost on the attached data.
      This function can be overridden, when the cost
      needs to be calculated differently,
      for example with a tensorflow model.

      :param Pipeline pipeline: machine learning pipeline
          that will be evaluated with cross-validation

      :output: dictionary with the aggregated
          cross-validation score (key 'value') and
          the score variance (key 'variance').
      '''
      # Imported here so the method also works when this file is imported
      # as a module and not only when it is run as a script.
      from sklearn.metrics import make_scorer

      scores = cross_validate(estimator=pipeline,
                              X=self._X,
                              y=self._y,
                              # 5 folds when no cv object was attached
                              cv=self._cv or 5,
                              scoring=make_scorer(self._cost_func),
                              # failed fits give NaN instead of raising
                              error_score=np.nan)

      return {'value': self._averaging_func(scores['test_score']),
              'variance': np.var(scores['test_score'])}
  def _objective(self, space_element: dict) -> dict:
      '''
      This method is called in search_for_best_pipeline
      inside the hyperopt fmin method.

      Uses _evaluate method.

      It must take as input a space element
      and produce an output in the form of dictionary
      with 2 obligatory values loss and status
      (STATUS_OK or STATUS_FAIL). Other
      values in the output are optional and can be
      accessed later through the trials object.

      :Warning: fmin minimizes the loss,
      when _evaluate returns a value to be maximized,
      it should be multiplied by -1 to obtain loss.

      :param dict space_element: must contain keys
          name (with the name of the pipeline),
          pipeline (Pipeline object),
          params (dict of pipeline params)

      :output: dictionary with keys
          loss (minimized value),
          status with values STATUS_OK or STATUS_FAIL
          understood by hyperopt,
          score (equal to loss or -loss),
          score_variance,
          timestamp (end of execution),
          train_time: execution time

      :raises AssertionError: when space_element is malformed.
      :raises Exception: when no data has been attached.
      '''
      assert (isinstance(space_element, dict) and
              set(['name', 'pipeline', 'params']) <= space_element.keys())

      assert (isinstance(space_element['name'], str) and
              isinstance(space_element['pipeline'], Pipeline) and
              isinstance(space_element['params'], dict))

      start_time = time.time()

      if not self._data_attached:
          raise Exception(("Data must be attached in order "
                           "to effectuate the best "
                           "pipeline search"))

      self._run_number += 1

      pipeline = space_element['pipeline']
      params = space_element['params']
      pipeline.set_params(**params)

      self._logger.info(("Run number {0}: "
                         "Current score is {1}: "
                         "Training pipeline {2} "
                         "with parameters: {3}. ").format(
                           self._run_number,
                           self._best_score,
                           space_element['name'],
                           params))

      try:
          score_stats = self._evaluate(pipeline)
          assert not np.isnan(score_stats["value"]), \
              "Returned null score"

          if self._run_number % self._backup_trials_freq == 0:
              self._backup_trials()

          # The best score is NaN until a first trial succeeded.
          # Scores are compared on the loss scale (score * factor),
          # where smaller is always better.
          if np.isnan(self._best_score) or\
              self._score_factor*score_stats["value"] <\
                  self._score_factor*self._best_score:

              self._logger.info("Score got better, new best score is: {}"
                                .format(score_stats["value"]))

              self._best_score = score_stats['value']

              self._backup_trials()

          end_time = time.time()

          return {'loss': self._score_factor * score_stats["value"],
                  'status': STATUS_OK,
                  'score': score_stats["value"],
                  'score_variance': score_stats["variance"],
                  'timestamp': datetime.datetime.today(),
                  'train_time': end_time - start_time}

      except Exception as e:
          # A failing trial must not stop the search:
          # it is reported to hyperopt as failed.
          self._logger.warning("Trial failed with error {}".format(e))

          return {'loss': np.nan,
                  'status': STATUS_FAIL,
                  'score': np.nan,
                  'score_variance': np.nan,
                  'timestamp': datetime.datetime.today(),
                  'train_time': np.nan}
  def search_for_best_pipeline(self,
                               niter: int,
                               algo: callable = tpe.suggest):
      '''
      Method performing the search of the best pipeline in the
      attached space. Calls fmin function from the hyperopt library
      to minimize the output of _objective. The trials object is
      saved when the search is finished.

      :param int niter: number of search iterations, performed
          in addition to the trials already stored
      :param callable algo: now can only take values tpe.suggest for a
          tree-based search or rand.suggest for random search
      :raises AssertionError: when no space is attached or
          an argument is invalid.
      :raises ValueError: when the search fails.
      '''
      assert self._space_attached, \
          "Space must be attach to be able to retrieve this information."

      assert isinstance(niter, int), \
          "Parameter 'niter' must be of int type"

      # right now only two algorithms are provided by hyperopt
      assert algo in [tpe.suggest, rand.suggest], \
          ("Parameter 'algo' can be now only tpe or random. "
           "If other algorithms have been developped by "
           "by hyperopt, plased add them to the list.")

      try:
          self._logger.info(("Starting {0} iterations of search "
                             "additional to {1} previous"
                             .format(niter, len(self._trials.trials))))

          # max_evals counts the stored trials as well, hence the offset.
          best = fmin(fn=self._objective,
                      space=self._space,
                      algo=algo,
                      trials=self._trials,
                      max_evals=len(self._trials.trials) + niter)

          self._logger.info(
                  "Best score is {0} with variance {1}"
                  .format(
                   self._trials.best_trial["result"]["score"],
                   self._trials.best_trial["result"]["score_variance"]))

          self._logger.info(("Finished {0} iterations of search.\n"
                             "Best parameters are:\n {1} ")
                            .format(niter,
                                    space_eval(self._space, best)))

          self._backup_trials()

      except Exception as e:
          # Chain the cause so that the original traceback is kept.
          raise ValueError(("Failed to select best "
                           "pipeline! Exit with error: {}").format(e)) from e
  397. @property
  398. def best_trial_score(self) -> float:
  399. '''
  400. '''
  401. if len(self._trials.trials) > 0:
  402. return self._trials.best_trial["result"]["score"]
  403. else:
  404. return np.nan
  405. @property
  406. def best_trial_score_variance(self) -> float:
  407. '''
  408. '''
  409. if len(self._trials.trials) > 0:
  410. return self._trials.best_trial["result"]["score_variance"]
  411. else:
  412. return np.nan
  413. @property
  414. def best_trial_pipeline(self) -> Pipeline:
  415. '''
  416. '''
  417. assert(self._space_attached),\
  418. "Space must be attach to be able to retrieve this information."
  419. if len(self._trials.trials) > 0:
  420. return space_eval(
  421. space,
  422. {k: v[0] for k, v in
  423. self._trials.best_trial['misc']['vals'].items()
  424. if len(v) > 0})["pipeline"]
  425. else:
  426. err = ("Trials object is empty. "
  427. "Best pipeline cannot be returned")
  428. self._logger.error(err)
  429. raise Exception(err)
  430. def _ith_trial_loss(self, i: int) -> float:
  431. '''
  432. '''
  433. if len(self._trials.trials) >= i:
  434. return self._trials.trials[i]['result']['loss']
  435. else:
  436. return np.nan
  437. def _ith_trial_element(self, i: int, name: str) -> object:
  438. '''
  439. '''
  440. assert(self._space_attached),\
  441. "Space must be attach to be able to retrieve this information."
  442. if len(self._trials.trials) >= i:
  443. return space_eval(self._space,
  444. {k: v[0] for k, v in
  445. self._trials.trials[i]['misc']['vals']
  446. .items() if len(v) > 0})[name]
  447. def _ith_trial_pipeline(self, i: int) -> Pipeline:
  448. '''
  449. '''
  450. return self._ith_trial_element(i=i, name='pipeline')
  451. def _ith_trial_name(self, i: int) -> str:
  452. '''
  453. '''
  454. return self._ith_trial_element(i=i, name='name')
  455. def _ith_trial_params(self, i: int) -> dict:
  456. '''
  457. '''
  458. return self._ith_trial_element(i=i, name='params')
  459. def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
  460. '''
  461. '''
  462. if len(self._trials.trials) >= i:
  463. return self._trials.trials[i]["result"]["timestamp"]
  464. def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
  465. '''
  466. Returns the list of n best pipelines
  467. documented in trials
  468. '''
  469. if len(self._trials.trials) > 0:
  470. if losses is None:
  471. losses = [self._ith_trial_loss(i)
  472. for i in range(len(self._trials.trials))]
  473. best_n_indices = [losses.index(l)
  474. for l in sorted(list(set(losses)))[:n]]
  475. return [self._ith_trial_pipeline(i) for i in best_n_indices]
  476. else:
  477. err = ("Trials object is empty. "
  478. "Best pipeline cannot be returned")
  479. self._logger.error(err)
  480. raise Exception(err)
  481. def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
  482. '''
  483. Returns a dictiionry where keys are pipeline names,
  484. and values are lists of best pipelines with this name
  485. '''
  486. assert(isinstance(n, int)), "Parameter 'n' must be an integer"
  487. if len(self._trials.trials) > 0:
  488. best_pipelines_per_type = {}
  489. names = [self._ith_trial_name(i)
  490. for i in range(len(self._trials.trials))]
  491. for nm in names:
  492. losses = [self._ith_trial_loss(i)
  493. for i in range(len(self._trials.trials))
  494. if self._ith_trial_name(i) == nm]
  495. best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
  496. n=n,
  497. losses=losses)
  498. return best_pipelines_per_type
  499. else:
  500. err = ("Trials object is empty. "
  501. "Best pipeline cannot be returned")
  502. self._logger.error(err)
  503. raise Exception(err)
  def write_trials_documentation(self, path: str = None):
      '''
      Saves an excel file with pipeline names, scores,
      parameters, and timestamps of all trials.
      With an empty trials object a file without rows is written.

      :param str path: optional, path of the excel file.
          When not provided, "hyperopt_trials_documentation.xlsx"
          is used.
      :raises AssertionError: when path is not a string.
      '''
      # Resolve and check the path first, so that it is defined
      # whether or not trials exist.
      path = path or "hyperopt_trials_documentation.xlsx"

      assert isinstance(path, str), \
          "Parameter 'path' must be of string type"

      self._assert_valid_directory(path)

      trial_indices = range(len(self._trials.trials))

      names = [self._ith_trial_name(i) for i in trial_indices]

      # The score factor converts the stored losses back to scores.
      scores = [self._score_factor*self._ith_trial_loss(i)
                for i in trial_indices]

      params = [self._ith_trial_params(i) for i in trial_indices]

      timestamps = [self._ith_trial_timestamp(i) for i in trial_indices]

      pd.DataFrame({"name": names,
                    "score": scores,
                    "params": params,
                    "timestamp": timestamps})\
        .to_excel(path)
  if __name__ == '__main__':

      from sklearn.metrics import roc_auc_score, make_scorer
      from xgboost import XGBClassifier
      from sklearn.svm import SVC
      from sklearn.feature_selection import SelectKBest
      from sklearn.decomposition import PCA
      from sklearn.datasets import load_iris
      from pprint import pprint

      data = load_iris()
      X = pd.DataFrame(data.data)
      # produce a binary variable
      y = (pd.Series(data.target) == 2).astype(int)
      del data
      gc.collect()

      # SPACE DEFINITION ########################################
      # (can be moved to a separate python script)
      #
      # A search space must be a list of dictionaries.
      # Each dictionary must have keys:
      #     name (pipeline name or type),
      #     pipeline (instance of sklearn.pipeline.Pipeline),
      #     params (dictionary of distributions for the parameters of
      #             the pipeline that we want to tune)
      #
      # Here we have a space that consists of two dictionaries:
      # KBEST_XGBOOST and PCA_SVC
      #
      # A pipeline consists of steps (tuples).
      # Each step has a name and an algorithm.
      # The first pipeline, as a first step performs
      # feature selection with SelectKBest and
      # as a second step evaluates a machine learning algo (xgboost).
      # Like all sklearn algorithms, a Pipeline has methods
      # fit, predict, set_params, get_params
      #
      # Pipeline parameter dictionaries must be of the form:
      # {'kbest__k': 3, xgb__n_estimators: 20},
      # each parameter name consists of the step name, __, and
      # parameter name.
      # Here, instead of values, the parameter names are followed
      # by hyperopt distributions.
      # Each hyperopt distribution also must have a name,
      # due to hyperopt functionality.
      # Here, we set the hyperopt distribution name to the parameter name,
      # but it does not have to be so. Hyperopt distribution names
      # must be different for different elements of the space.
      pipeline_dist_1 = {
          "name": "KBEST_XGBOOST",
          "pipeline": Pipeline([
              ('kbest', SelectKBest()),
              ('xgb', XGBClassifier())
          ]),
          "params": {
              'kbest__k': hp.choice('kbest__k', range(1, 5)),
              'xgb__n_estimators':
                  50 + hp.randint('xgb__n_estimators', 50),
              "xgb__learning_rate":
                  hp.loguniform('xgb__learning_rate',
                                np.log(0.01), np.log(0.2)),
          },
      }

      pipeline_dist_2 = {
          "name": "PCA_SVC",
          "pipeline": Pipeline([
              ('pca', PCA()),
              ('svc', SVC(gamma="scale"))
          ]),
          "params": {
              "pca__n_components": 1 + hp.randint("pca__n_components", 4),
              "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1)),
          },
      }

      space = hp.choice('pipelines', [pipeline_dist_1, pipeline_dist_2])

      # TESTING ##########################################################
      trials_path = 'TEST_hyperopt_trials.pkl'
      doc_path = 'TEST_hyperopt_doc.xlsx'

      hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
                                         greater_is_better=True,
                                         trials_path=trials_path)

      hp_obj.attach_data(X_train=X, y_train=y)
      hp_obj.attach_space(space=space)
      hp_obj.search_for_best_pipeline(niter=10)

      print('\n', '='*20, 'TESTING', '='*20)

      print('\n', 'Best score:', hp_obj.best_trial_score)
      print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
      print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)

      print('\n', 'Best 3 pipelines: \n')
      pprint(hp_obj.get_n_best_trial_pipelines(n=3))

      print('\n', 'Best pipeline per type: \n')
      pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))

      hp_obj.write_trials_documentation(path=doc_path)

      # os.remove(doc_path)
      # os.remove(trials_path)