#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 6 14:02:24 2020

@author: tanya
@description: space object to pass to HyperoptPipelineSelection class
"""
import math

from hyperopt import hp
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBRFClassifier

from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
# TODO: add sample spaces for encoders and transformers
# Both step lists are placeholders: they are still passed on to the space
# composer at the bottom of this module, but currently contribute no
# candidates. New entries presumably follow the same
# {"name": ..., "object": ..., "params": ...} layout used by `selectors`
# and `models` -- confirm against SpaceComposer before adding any.
encoders = []
transformers = []
# Candidate feature-selection / dimensionality-reduction steps.
#
# Every entry is a dict with three keys:
#   "name"   - short identifier of the candidate; also used as the prefix of
#              the hyperopt labels (except for the PCA entry, see below),
#   "object" - an unfitted estimator instance carrying the fixed settings,
#   "params" - mapping from estimator parameter name to a hyperopt
#              expression describing the values to search over.
#
# hp.randint(label, upper) draws an integer from [0, upper), so
# `offset + hp.randint(label, upper)` covers offset .. offset + upper - 1.
selectors = [
    # Univariate selection keeping the top 3..62 percent of the features.
    # NOTE: chi2 requires non-negative feature values.
    {
        "name": "kbest",
        "object": SelectPercentile(),
        "params": {
            "percentile": 3 + hp.randint("kbest__percentile", 60),
            "score_func": hp.choice(
                "kbest__score_func",
                [f_classif, chi2, mutual_info_classif],
            ),
        },
    },
    # Univariate selection based on a false-positive-rate test.
    {
        "name": "fpr",
        "object": SelectFpr(),
        "params": {
            # mutual_info_classif does not work here
            # (presumably because it yields scores only, no p-values)
            "score_func": hp.choice("fpr__score_func", [f_classif, chi2]),
            "alpha": hp.uniform("fpr__alpha", 0.1, 0.6),
        },
    },
    # Recursive feature elimination driven by a random forest:
    # 3..202 features kept, 20..89 trees in the forest.
    {
        "name": "rfe_rf",
        "object": RFE(
            estimator=RandomForestClassifier(n_jobs=-1, random_state=33)
        ),
        "params": {
            "n_features_to_select":
                3 + hp.randint("rfe_rf__n_features_to_select", 200),
            "estimator__n_estimators":
                20 + hp.randint("rfe_rf__estimator__n_estimators", 70),
        },
    },
    # Model-based selection from a random forest with 20..89 trees.
    {
        "name": "rfm_rf",
        "object": SelectFromModel(
            estimator=RandomForestClassifier(n_jobs=-1, random_state=33)
        ),
        "params": {
            "estimator__n_estimators":
                20 + hp.randint("rfm_rf__estimator__n_estimators", 70),
        },
    },
    # Model-based selection from a logistic regression; the inverse
    # regularisation strength C is searched uniformly in [0.1, 1000].
    {
        "name": "rfm_lr",
        "object": SelectFromModel(
            estimator=LogisticRegression(n_jobs=-1, random_state=33)
        ),
        "params": {
            "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000),
        },
    },
    # Standardise, then project with PCA. n_components is sampled as a
    # float between 0.1 and 1. The hyperopt label here is
    # "pca__n_components", i.e. it is not prefixed with the entry name
    # like the labels of the other entries.
    {
        "name": "std_scaler_pca",
        "object": Pipeline([
            ("scaler", StandardScaler()),
            ("pca", PCA(random_state=33)),
        ]),
        "params": {
            "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
        },
    },
]
# Candidate final estimators.
#
# Every entry is a dict with three keys:
#   "name"   - short identifier of the candidate, also used as the prefix of
#              its hyperopt labels,
#   "object" - an unfitted estimator instance carrying the fixed settings,
#   "params" - mapping from estimator parameter name to a hyperopt
#              expression describing the values to search over.
#
# hp.randint(label, upper) draws an integer from [0, upper), so
# `offset + hp.randint(label, upper)` covers offset .. offset + upper - 1.
models = [
    # 50..149 estimators, depth 3..12.
    {
        "name": "xgb",
        "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
        "params": {
            "n_estimators": 50 + hp.randint("xgb__n_estimators", 100),
            "max_depth": 3 + hp.randint("xgb__max_depth", 10),
            # hp.loguniform(label, low, high) returns exp(uniform(low, high)),
            # i.e. its bounds live in log space. Passing the raw values
            # 0.01 and 0.5 would sample learning rates of about 1.01..1.65,
            # so the bounds are converted with math.log to obtain the
            # intended range 0.01..0.5.
            "learning_rate": hp.loguniform(
                "xgb__learning_rate", math.log(0.01), math.log(0.5)
            ),
        },
    },
    # 50..549 trees, depth 3..12, at least 1..10 samples per leaf.
    {
        "name": "rf",
        "object": RandomForestClassifier(n_jobs=-1, random_state=33),
        "params": {
            "n_estimators": 50 + hp.randint("rf__n_estimators", 500),
            "max_depth": 3 + hp.randint("rf__max_depth", 10),
            "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10),
        },
    },
    # the default solver does not accept l1 penalty, hence liblinear.
    # n_jobs is deliberately not set: liblinear ignores it and scikit-learn
    # only emits a warning on every fit when it is given.
    {
        "name": "lr",
        "object": LogisticRegression(random_state=33, solver="liblinear"),
        "params": {
            "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
            "C": hp.uniform("lr__C", 0.1, 1000),
        },
    },
    # svc does not support parallelization, therefore is slow.
    # "degree" (2..4) is only used by the "poly" kernel.
    {
        "name": "svc",
        "object": SVC(random_state=33),
        "params": {
            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
            "degree": 2 + hp.randint("svc__degree", 3),
            "C": hp.uniform("svc__C", 0.1, 1000),
        },
    },
]
# Candidate lists in the order they are handed to the composer:
# encoders, transformers, selectors, models.
step_list = [encoders, transformers, selectors, models]

# Search space built from the step candidates above. It is composed once,
# when this module is imported; per the module docstring it is meant to be
# passed to the HyperoptPipelineSelection class.
space = SpaceComposer().compose_hyperopt_space(step_list)