composed_space_sample.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 6 14:02:24 2020
@author: tanya
@description: space object to pass to HyperoptPipelineSelection class
"""
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
from xgboost import XGBRFClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from hyperopt import hp

from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer

# TODO: add sample spaces for encoders and transformers
encoders = []

transformers = []

# feature selection steps: each entry pairs a scikit-learn selector
# with the hyperopt search ranges of its parameters
selectors = [
    {"name": "kbest",
     "object": SelectPercentile(),
     "params": {
         "percentile": 3 + hp.randint("kbest__percentile", 60),
         "score_func": hp.choice("kbest__score_func",
                                 [f_classif, chi2, mutual_info_classif])}},

    {"name": "fpr",
     "object": SelectFpr(),
     "params": {
         "score_func": hp.choice("fpr__score_func",
                                 [f_classif, chi2]),
         # mutual_info_classif does not work here
         "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},

    {"name": "rfe_rf",
     "object":
         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
     "params": {
         "n_features_to_select":
             3 + hp.randint("rfe_rf__n_features_to_select", 200),
         "estimator__n_estimators":
             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},

    {"name": "rfm_rf",
     "object":
         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
                                                          random_state=33)),
     "params": {
         "estimator__n_estimators":
             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},

    {"name": "rfm_lr",
     "object":
         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
                                                      random_state=33)),
     "params": {
         "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},

    {"name": "std_scaler_pca",
     "object": Pipeline([
         ("scaler", StandardScaler()),
         ("pca", PCA(random_state=33))]),
     "params": {
         "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
     }}
]

# candidate classifiers with the hyperopt search ranges of their parameters
models = [
    {"name": "xgb",
     "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
     "params": {
         "n_estimators": 50 + hp.randint("xgb__n_estimators", 100),
         "max_depth": 3 + hp.randint("xgb__max_depth", 10),
  71. "learning_rate": hp.loguniform("xgb__learning_rate", 0.01, 0.5)
     }},

    {"name": "rf",
     "object": RandomForestClassifier(n_jobs=-1, random_state=33),
     "params": {
         "n_estimators": 50 + hp.randint("rf__n_estimators", 500),
         "max_depth": 3 + hp.randint("rf__max_depth", 10),
         "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
     }},

    # the default solver does not accept l1 penalty
    {"name": "lr",
     "object": LogisticRegression(random_state=33,
                                  solver="liblinear",
                                  # n_jobs=-1
                                  ),
     "params": {
         "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
         "C": hp.uniform("lr__C", 0.1, 1000)}},
    # SVC does not support parallelization, therefore it is slow
  90. {"name": "svc",
  91. "object": SVC(random_state=33),
  92. "params": {
  93. "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
  94. "degree": 2 + hp.randint("svc__degree", 3),
  95. "C": hp.uniform("svc__C", 0.1, 1000)
  96. }}
  97. ]
  98. step_list = [encoders, transformers, selectors, models]
  99. space = SpaceComposer().compose_hyperopt_space(step_list)
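
# A minimal usage sketch, assuming that SpaceComposer.compose_hyperopt_space
# returns a standard hyperopt/pyll search-space expression (cdplib is an
# internal library, so this is an assumption rather than documented behaviour).
# Drawing one random configuration is a quick way to sanity-check the space.
if __name__ == "__main__":
    from hyperopt.pyll import stochastic

    # draw a single random pipeline configuration from the composed space
    sample_config = stochastic.sample(space)
    print(sample_config)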