
added current version of libraries

tanja committed 4 years ago
commit 4dc75c35a3

+ 63 - 0
cdplib/ExceptionsHandler.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 27 14:20:58 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import logging
+import pandas as pd
+sys.path.append(os.getcwd())
+
+
+class ExceptionsHandler:
+    '''
+    '''
+    def __init__(self):
+        '''
+        Initializes the logger used by the checks below.
+        '''
+        self._log = logging.getLogger()
+
+    def check_is_file(self, path):
+        '''
+        Logs an error and raises FileNotFoundError if path is not an existing file.
+        '''
+        if not os.path.isfile(path):
+            err = "File {} not found".format(path)
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
+                               error_or_warning: str, logger=None):
+        '''
+        Logs (and, for "error", raises) when any of the given columns
+        is missing from the dataframe.
+        '''
+        if logger is None:
+            logger = logging.getLogger()
+        if isinstance(columns, str):
+            columns = [columns]
+
+        for column in columns:
+
+            if column not in data.columns:
+                err = "Column {} is missing from the data".format(column)
+                getattr(logger, error_or_warning)(err)
+
+                if error_or_warning == "error":
+                    raise Exception(err)
+
+    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger=None):
+        '''
+        Raises an exception if any of the given columns is missing.
+        '''
+        return self._check_column_abscence(columns=columns,
+                                           data=data,
+                                           error_or_warning="error",
+                                           logger=logger)
+
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger=None):
+        '''
+        Logs a warning if any of the given columns is missing.
+        '''
+        return self._check_column_abscence(columns=columns,
+                                           data=data,
+                                           error_or_warning="warning",
+                                           logger=logger)
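
A minimal usage sketch for the new ExceptionsHandler (the import path is assumed from the file location, and the dataframe is made up for illustration):

    import logging
    import pandas as pd
    from cdplib.ExceptionsHandler import ExceptionsHandler  # assumed import path

    logging.basicConfig(level=logging.INFO)

    handler = ExceptionsHandler()
    df = pd.DataFrame({"radsatznummer": ["a1", "a2"]})

    handler.warn_column_abscence(columns=["radsatznummer", "status"], data=df)  # logs a warning for "status"
    handler.error_column_abscence(columns="radsatznummer", data=df)             # passes, the column exists

Note that MigrationCleaning.py further down in this commit calls self._exception_handler.log_and_raise(...), which is not defined in this file; it presumably lives in the libraries.utils variant of the class or still needs to be added here.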

+ 0 - 207
cdplib/configuration.py

@@ -1,207 +0,0 @@
-"""
-@author: Juegen Pannosch (welser project), modified by Tanja Zolotareva
-
-@description: Here we define a data structure that contains arguments
-used throughout the project. Arguments (like data locations) that can differ
-from person to person are loaded from the ./.config file; arguments that should
-be the same for everyone are defined directly in the data structure.
-All changes in this script should be committed to git.
-"""
-
-# -*- coding: utf-8 -*-
-import os
-import configparser
-
-
-class Configuration:
-
-    def __init__(self,
-                 config_file: str = os.path.join(os.getcwd(), ".env")):
-        '''
-        '''
-        assert isinstance(config_file, str), "the config_file must be a string"
-
-        assert os.path.isfile(config_file), "config file was not found"
-
-        self._parse_ini_file(config_file)
-
-    def __getitem__(self, item):
-        '''
-        '''
-        if item in self._config:
-            return self._config[item]
-        else:
-            return None
-
-    def _parse_ini_file(self, config_file: str):
-        '''
-        '''
-        self._config = dict()
-
-        config = configparser.ConfigParser()
-        config.read(config_file)
-
-        for key in config:
-            self._config[key] = {}
-            sub_config = config[key]
-
-            for sub_key in sub_config:
-                name = sub_key.upper()
-                value = sub_config[sub_key]
-
-                self._config[key][name] = value if (value != '') else None
-
-    @property
-    def labeled_history_folder(self):
-        '''
-        '''
-        return os.path.join(self._config["LOCATIONS"]["DATA_DIR"],
-                            "Aufarbeitungsdaten/2018/Datenextrakt einsatzfähige Radsätze 2018")
-
-    @property
-    def unlabeled_history_yearly_folders(self):
-        '''
-        '''
-        folders = []
-
-        for year in ["2016", "2017", "2018"]:
-
-            folders.append(os.path.join(self._config["LOCATIONS"]["DATA_DIR"],
-                                        "Aufarbeitungsdaten",
-                                        year,
-                                        "Datenextrakt alle Radsätze {} ausgehend von der Station 110").format(year))
-
-        return folders
-
-    @property
-    def additional_data_folder(self):
-        '''
-        '''
-        return os.path.join(self._config["LOCATIONS"]["DATA_DIR"],
-                            "Info-Austausch")
-
-    @property
-    def columns_rs516(self):
-        '''
-        '''
-        return {0: "radsatznummer",
-                1: "positionsnummer",
-                2: "status",
-                3: "taetigkeitsname",
-                4: "datum",
-                5: "presskrafdiagram_min",
-                6: "presskrafdiagram_max",
-                7: "presskrafdiagram_wert"}
-
-    @property
-    def ihs_labels(self):
-        '''
-        For analysis we replace the string IHS by an integer value,
-        which can be useful for comparing the IHS of two wheelsets
-        '''
-
-        ihs_labels = {"null": -1,
-                      "IS1": 0,
-                      "IS1L": 1,
-                      "IS2": 2,
-                      "IS3": 3}
-
-        return ihs_labels
-
-    @property
-    def schrott_schadcodes(self):
-        '''
-        If during the process one of the following schadcodes is assigned,
-         then the wheelset is scrap and is removed from the process.
-         This should correspond to aufarbeitungstyp = 2 in rs0, but if there
-         was a delay (or a mistake) in the maintenance of the table
-         rs0, this might not be the case. Count as scrap anyway.
-        '''
-        schadcodes_schrott = ["RSAUS"]
-
-        return schadcodes_schrott
-
-    @property
-    def schrott_taetigkeiten(self):
-        '''
-        If during the process one of the following tätigkeiten is assigned,
-         then the wheelset is scrap and is removed from the process.
-         This should correspond to aufarbeitungstyp = 2 in rs0 and (or)
-         to the assignment of a corresponding schadcode. Data might contain
-         inconsistencies. If such an activity is assigned, count as scrap.
-        '''
-        taetigkeiten_schrott = ["RADSATZ AUSSCHEIDEN"]
-
-        return taetigkeiten_schrott
-
-    @property
-    def status_labels(self):
-        '''
-        Used to standardize the column "Status" in the table rs1;
-         the integer values are convenient for analysis
-        '''
-        status_labels = {"Scheiden": 2,
-                         "Schlecht": 1,
-                         "Fertig": 0,
-                         "Gut": 0}
-
-        return status_labels
-
-    @property
-    def process_stages(self):
-        '''
-        For machine learning predictions we divide the process into
-         big stages; stages can be skipped depending on the IHS of the
-         wheelset. We use all information gathered during the previous
-         process stages to make predictions for the next stage.
-        '''
-        import networkx as nx
-
-        critical_stations = {"A": [421, 110]}
-
-        critical_stations["B"] = [130, 131]
-
-        critical_stations["C"] = [140, 141, 142, 150]
-
-        critical_stations["D"] = [410, 420]
-
-        critical_stations["E"] = [510, 511, 520, 521, 535,
-                                  530, 531, 516, 550]
-
-        critical_stations["F"] = [490, 480, 430, 170]
-
-        critical_stations["G"] = [595, 190, 630]
-
-        critical_stations["H"] = [640, 641]
-
-        critical_stations["I"] = [650, 560]
-
-        critical_stations["J"] = [675]
-
-        critical_stations["K"] = [690]
-
-        critical_stations["L"] = [710, 670]
-
-        stages_graph = nx.DiGraph()
-
-        for stage in critical_stations:
-            stages_graph.add_node(stage, stations=critical_stations[stage])
-
-        stages_graph.add_edge("A", "B")
-        stages_graph.add_edge("B", "C")
-        stages_graph.add_edge("C", "D")
-        stages_graph.add_edge("D", "E")
-        stages_graph.add_edge("D", "F")
-        stages_graph.add_edge("E", "G")
-        stages_graph.add_edge("F", "G")
-        stages_graph.add_edge("G", "H")
-        stages_graph.add_edge("H", "I")
-        stages_graph.add_edge("I", "J")
-        stages_graph.add_edge("J", "K")
-        stages_graph.add_edge("K", "L")
-
-        return stages_graph
-
-
-# singleton
-default = Configuration()
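
The removed singleton is replaced below by handlers that read individual connection parameters from cfg. For reference, a minimal sketch of the ini-style file the (removed) parser expects and of how it becomes the cfg dictionary; the section and key names mirror those referenced by MongodbHandler and SQLHandler further down, while the values are placeholders:

    import configparser
    import textwrap

    example_env = textwrap.dedent("""\
        [MONGO]
        MONGO_USER = user
        MONGO_PASSWORD = secret
        MONGO_HOST = localhost
        MONGO_PORT = 27017
        MONGO_DATABASE_NAME = cdp

        [SQL]
        SQL_USER = user
        SQL_PASSWORD = secret
        SQL_HOST = localhost
        SQL_PORT = 3306
        SQL_DATABASE_NAME = cdp
        """)

    config = configparser.ConfigParser()
    config.read_string(example_env)

    # mirror the removed _parse_ini_file: upper-case the keys, map '' to None
    cfg = {section: {key.upper(): (value if value != "" else None)
                     for key, value in config[section].items()}
           for section in config}

    cfg["MONGO"]["MONGO_HOST"]  # -> 'localhost'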

+ 63 - 0
cdplib/data_cleaning/DataCleaningUtils.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 27 16:20:03 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+
+
+class CleaningUtils:
+    '''
+    '''
+    @staticmethod
+    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
+        '''
+        Tries each of the given formats in turn; values matching
+        no format remain NaT. A single format may be passed as a string.
+        '''
+        if isinstance(formats, str):
+            formats = [formats]
+
+        converted = pd.Series([pd.to_datetime(np.nan)]*len(series),
+                              index=series.index)
+
+        for formt in formats:
+            if formt == "%d%m%Y":
+                missing_leading_zero = (series.astype(str).str.len() == 7)
+
+                series = series.astype(str)
+
+                series.loc[missing_leading_zero] = "0" +\
+                    series.loc[missing_leading_zero]
+
+            converted_this_format = pd.to_datetime(series,
+                                                   format=formt,
+                                                   errors="coerce")
+
+            converted.fillna(converted_this_format, inplace=True)
+
+        return converted
+
+    def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
+        '''
+        Transliterates German umlauts and ß, optionally lowercases the
+        string, and replaces remaining non-alphanumeric characters
+        with underscores.
+        '''
+        import re
+
+        german_character_mapping = {"ß": "ss",
+                                    "ü": "ue",
+                                    "Ü": "Ue",
+                                    "ä": "ae",
+                                    "Ä": "Ae",
+                                    "ö": "oe",
+                                    "Ö": "Oe"}
+
+        s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
+        for char, correct_char in german_character_mapping.items():
+            s = s.replace(char, correct_char)
+
+        if to_lowercase:
+            s = s.lower()
+
+        s = re.sub('[^0-9a-zA-Z]+', '_', s).lstrip("_").rstrip("_")
+
+        return s
+
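
A quick sanity check of the two helpers above (import path assumed from the file location; the input values are made up):

    import pandas as pd
    from cdplib.data_cleaning.DataCleaningUtils import CleaningUtils  # assumed import path

    dates = pd.Series(["3012019", "31.01.2019", "not a date"])
    CleaningUtils.convert_dates(dates, formats=["%d%m%Y", "%d.%m.%Y"])
    # -> 2019-01-03, 2019-01-31, NaT  (the 7-character value gets its leading zero restored)

    CleaningUtils().standarize_writing("Radsatzwelle geprüft (Ölnut)")
    # -> 'radsatzwelle_geprueft_oelnut'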

+ 111 - 12
cdplib/db_handlers/MongodbHandler.py

@@ -21,34 +21,81 @@ import pandas as pd
 import numpy as np
 
 sys.path.append(os.getcwd())
-from cdplib.log import Log
-from cdplib.configuration import default as cfg
+from libraries.log import Log
+from libraries.configuration import default as cfg
+from libraries.Singleton_Threadsafe import SingletonThreadsafe
 
-class MongodbHandler:
 
+class MongodbHandlerPool(metaclass=SingletonThreadsafe):
+    '''
     '''
 
+    def __init__(self, size: int = 10):
+        self._size = size
+        self._log = Log("MongodbHandlerPool")
+        self._mongodb_handlers = [MongodbHandler() for _ in range(size)]
+
+    def aquire(self):
+        while not self._mongodb_handlers:
+            self._mongodb_handlers = [MongodbHandler() for _ in range(self._size)]
+            self._log.warning("Ran out of Mongodb handlers, {} more have been added. Are you sure you've returned yours?".format(self._size))
+        return self._mongodb_handlers.pop()
+
+    def release(self, mongodb_handler):
+        if len(self._mongodb_handlers) < self._size:
+            self._mongodb_handlers.append(mongodb_handler)
+
+
+class MongodbHandler:
+
     '''
 
-    def __init__(self, database_url: str = cfg['MONGO_DB']['URI'],
-                 database_name: str = cfg['MONGO_DB']['DATABASE_NAME']):
+    '''
+    def __init__(self, database_url: str = None,
+                 database_name: str = None):
         '''
         :param str database_url: Url for the mongodb database
         :param str database_name: Name of the database the database handler should handle
         '''
+        if database_url is None:
+
+            database_url = "mongodb://{0}:{1}@{2}:{3}"\
+                           .format(cfg["MONGO"]["MONGO_USER"],
+                                   cfg["MONGO"]["MONGO_PASSWORD"],
+                                   cfg["MONGO"]["MONGO_HOST"],
+                                   cfg["MONGO"]["MONGO_PORT"])
+
+        if database_name is None:
+
+            database_name = cfg["MONGO"]["MONGO_DATABASE_NAME"]
+
         assert(isinstance(database_url, str)),\
             "Parameter 'database_url' must be a string type"
         assert(isinstance(database_name, str)),\
             "Parameter 'database_name' must be a string type"
 
-        self._log = Log("\nMongodbHandler script")
+        self._log = Log("Mongodb Handler")
 
-        self._log.info('Mongodb Handler has been initialized')
         # Connect to the MongoDB
         self._client = MongoClient(database_url)
         # Connect to the oebb_db database, or create it if it doesn't exist.
         self._database = self._client[database_name]
 
+        self._database_name = database_name
+
+    def set_database(self, database_name):
+        self._database = self._client[database_name]
+
+    def drop_database(self):
+        '''
+        '''
+        self._client.drop_database(self._database_name)
+
+    def drop_collection(self, collection_name: str):
+        '''
+        '''
+        self._database[collection_name].drop()
+
     def _read_schema(self, schema_path: str) -> dict:
         '''
         :param str schema_path: path to the schema file.
@@ -60,11 +107,29 @@ class MongodbHandler:
         with open(schema_path) as json_file:
             schema = json.load(json_file)
 
-        if 'definitions' in schema:
+        definitions_flag = self._analyze_schema(schema)
+
+        if definitions_flag:
             schema = self._dereference_schema(schema)
 
         return schema
 
+    def _analyze_schema(self, schema: dict, definitions_flag: bool = False) -> bool:
+        '''
+        Recursively checks whether the schema contains a 'definitions'
+        section (so that it needs dereferencing) and strips
+        'default'/'default_values' entries along the way.
+        '''
+        for key in list(schema):
+            if key == 'definitions':
+                return True
+
+            if key in ('default', 'default_values'):
+                self._remove_defaults(schema)
+                continue
+
+            if isinstance(schema.get(key), dict):
+                definitions_flag = self._analyze_schema(schema[key], definitions_flag)
+
+        return definitions_flag
+
     def _dereference_schema(self, schema: dict) -> dict:
         '''
         :param dict schema: dictionary containing a schema which uses references.
@@ -78,6 +143,20 @@ class MongodbHandler:
         schema.pop('definitions', None)
         return schema
 
+    def _remove_defaults(self, schema: dict) -> dict:
+        '''
+        :param dict schema: schema dictionary from which the 'default' and
+         'default_values' entries are removed.
+        '''
+        assert(isinstance(schema, dict)),\
+            "Parameter 'schema' must be a dictionary type"
+
+        if 'default' in schema:
+            del schema['default']
+        if 'default_values' in schema:
+            del schema['default_values']
+        return schema
+
     def set_collection_schema(self, collection_name: str, schema_path: str,
                               validation_level: str = 'moderate',validation_action: str = 'error'):
         '''
@@ -152,6 +231,8 @@ class MongodbHandler:
 
             if isinstance(data, pd.DataFrame) and (len(data) == 1):
                 data = data.iloc[0]
+            elif isinstance(data, list):
+                data = data[0]
 
             self._database[collection_name].insert_one(data)
         else:
@@ -177,14 +258,32 @@ class MongodbHandler:
         '''
 
         '''
-        if attribute is None or attribute_value is None:
+        if attribute is None or attribute_value is None:
             data = self._database[collection_name].find()
         else:
             data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}})
 
-        df = pd.DataFrame(list(data))
-        df.set_index('radsatznummer', inplace=True)
-        return df
+        if data.count() > 0:
+            df = pd.DataFrame(list(data))
+            df.set_index('radsatznummer', inplace=True)
+            return df
+        else:
+            self._log.warning('No data for the query was found')
+
+    def aggregate_data_and_generate_dataframe(self, collection_name: str, aggregation_pipeline: list):
+
+        data = list(self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True))
+
+        if len(data) > 0:
+            df = pd.DataFrame(data)
+            df.set_index('radsatznummer', inplace=True)
+            return df
+        else:
+            self._log.warning('No data for the query was found')
+
+
+    def update_data_in_collection(self, query_label: str, query_value: str,
+                                  update_label: str, update_value: str,
+                                  collection_name: str):
+        '''
+        Sets update_label to update_value on the first document matching
+        query_label == query_value in the given collection.
+        '''
+        self._database[collection_name].update_one({query_label: query_value},
+                                                   {"$set": {update_label: update_value}})
 
 
 if __name__ == "__main__":
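
The new pool follows a simple acquire/release protocol (the method is spelled aquire in the code). A hedged usage sketch; it assumes a reachable MongoDB, a populated .env and the import path used elsewhere in this commit, and the collection name is invented:

    from libraries.db_handlers.MongodbHandler import MongodbHandlerPool  # path as used in this commit

    pool = MongodbHandlerPool(size=5)   # SingletonThreadsafe: every caller gets the same pool

    handler = pool.aquire()
    try:
        df = handler.aggregate_data_and_generate_dataframe(
            "process_instances",                          # invented collection name
            [{"$match": {"status": 0}}])
    finally:
        pool.release(handler)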

+ 47 - 14
cdplib/db_handlers/SQLHandler.py

@@ -15,7 +15,25 @@ import pandas as pd
 import warnings
 
 sys.path.append(os.getcwd())
+from libraries.Singleton_Threadsafe import SingletonThreadsafe
 
+class SQLHandlerPool(metaclass=SingletonThreadsafe):
+    '''
+    '''
+
+    def __init__(self, size: int = 10):
+        self._size = size
+        self._log = Log(name='SQLHandlerPool')
+        self._sql_handlers = [SQLHandler() for _ in range(size)]
+
+    def aquire(self):
+        while not self._sql_handlers:
+            self._sql_handlers = [SQLHandler() for _ in range(self._size)]
+            self._log.warning("Ran out of SQL handlers, {} more have been added. Are you sure you've returned yours?".format(self._size))
+        return self._sql_handlers.pop()
+
+    def release(self, sql_handler):
+        if len(self._sql_handlers) < self._size:
+            self._sql_handlers.append(sql_handler)
 
 class SQLHandler:
     '''
@@ -25,7 +43,7 @@ class SQLHandler:
     closing of a database connection,
      this avoids errors when parallelizing with multiprocessing.
     '''
-
     def __init__(self, db_uri: str = None,
                  is_case_insensitive: bool = False):
         '''
@@ -45,7 +63,13 @@ class SQLHandler:
         self._log = Log(name='SQLHandler')
 
         if db_uri is None:
-            db_uri = cfg["SQL_DB"]["URI"]
+
+            db_uri = "mysql+pymysql://{0}:{1}@{2}:{3}/{4}?charset=utf8&local_infile=1"\
+                     .format(cfg["SQL"]["SQL_USER"],
+                             cfg["SQL"]["SQL_PASSWORD"],
+                             cfg["SQL"]["SQL_HOST"],
+                             cfg["SQL"]["SQL_PORT"],
+                             cfg["SQL"]["SQL_DATABASE_NAME"])
 
         assert(isinstance(db_uri, str)),\
             "Parameter 'db_uri' must be of type str"
@@ -76,7 +100,7 @@ class SQLHandler:
 
         self._is_case_insensitive = is_case_insensitive
 
-        self._engine = sqlalchemy.create_engine(self._db_uri)
+        self._engine = engine
 
     @property
     def _connection_params(self) -> dict:
@@ -117,7 +141,9 @@ class SQLHandler:
         '''
         database = self._connection_params["db"]
         self.execute("DROP DATABASE IF EXISTS {}".format(database))
-
+        self._engine.execute("CREATE DATABASE {}".format(database))
+        self._engine.execute("USE {}".format(database))
+        
     @property
     def _db_metadata(self) -> dict:
         '''
@@ -177,7 +203,7 @@ class SQLHandler:
         '''
         connection = self._engine.connect()
         transaction = connection.begin()
-
+    
         errors = []
 
         # in the case of multi-query execute each query
@@ -242,7 +268,7 @@ class SQLHandler:
 
     def check_if_table_exists(self, tablename: str,
                               schema: str = None,
-                              query: str = None):
+                              query: str = None) -> bool:
         '''
         Tries to retrieve table information from database with given query.
         If this does not work, tries to select one row from the given table,
@@ -478,7 +504,7 @@ class SQLHandler:
             data = pd.read_sql(sql=query,
                                con=connection,
                                **read_sql_kwargs)
-
+            #self._engine.dispose()
             connection.close()
             return data
 
@@ -528,16 +554,24 @@ class SQLHandler:
         try:
             connection = self._engine.connect()
 
-            data.to_sql(name=tablename,
-                        schema=schema,
-                        con=connection,
-                        if_exists='append',
-                        **to_sql_kwargs)
+            if self.check_if_table_exists(tablename=tablename, schema=schema):
+
+                data.to_sql(name=tablename,
+                            schema=schema,
+                            con=connection,
+                            if_exists='append',
+                            **to_sql_kwargs)
+            else:
+
+                self.overwrite_table(data=data,
+                                     tablename=tablename,
+                                     schema=schema,
+                                     to_sql_kwargs=to_sql_kwargs)
 
             connection.close()
 
         except Exception as e:
-            err = ("Could append data to the table {0}. "
+            err = ("Could not append data to the table {0}. "
                    "Finished with error {1}").format(tablename, e)
 
             self._log.error(err)
@@ -566,7 +600,6 @@ class SQLHandler:
                         con=connection,
                         if_exists='replace',
                         **to_sql_kwargs)
-
             connection.close()
 
         except Exception as e:
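
With the change above, append_to_table now falls back to overwrite_table (which creates the table) when the target table does not exist yet. A hedged sketch, assuming the SQL connection parameters are present in the .env and using an invented table name:

    import pandas as pd
    from libraries.db_handlers.SQLHandler import SQLHandler  # path as used elsewhere in this commit

    db = SQLHandler()
    report = pd.DataFrame({"radsatznummer": ["a1"],
                           "reason": ["Too large values in field presskrafdiagram_wert"]})

    # creates the table on the first call, appends on subsequent calls
    db.append_to_table(data=report, tablename="inconsistent_rows")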

BIN
cdplib/db_handlers/__pycache__/MongodbHandler.cpython-37.pyc


BIN
cdplib/db_handlers/__pycache__/SQLHandler.cpython-37.pyc


+ 98 - 54
cdplib/db_migration/DataFrameToCollection.py

@@ -10,16 +10,17 @@ Created on Mon Jul 22 11:05:47 2019
 """
 
 import pandas as pd
+import numpy as np
 import os
 import sys
 
 sys.path.append(os.getcwd())
 
 
-class DataFrameToCollection:
+class DataFrameToCollection():
     '''
     '''
-    def __init__(self, schema_path: str = None, log_path: str = None):
+    def __init__(self, schema_path: str):
         '''
         '''
         from libraries.log import Log
@@ -27,32 +28,29 @@ class DataFrameToCollection:
 
         self._log = Log("ParseJsonSchema")
 
-        if schema_path is not None:
 
-            if not os.path.isfile(schema_path):
-                err = "JsonSchema not found"
-                self._log.error(err)
-                raise FileNotFoundError(err)
+        if not os.path.isfile(schema_path):
+            err = "JsonSchema not found"
+            self._log.error(err)
+            raise FileNotFoundError(err)
 
-            # load schema to dictionary if it is a valid json file
-            try:
-                with open(schema_path, "r") as f:
-                    self.schema = json.load(f)
+        # load schema to dictionary if it is a valid json file
+        try:
+            with open(schema_path, "r") as f:
+                self.schema = json.load(f)
 
-            except Exception as e:
-                err = ("Could not load json schema, "
-                       "Obtained error {}".format(e))
+        except Exception as e:
+            err = ("Could not load json schema, "
+                   "Obtained error {}".format(e))
 
-                self._log.error(err)
-                raise Exception(err)
+            self._log.error(err)
+            raise Exception(err)
 
-        else:
-            self.schema = None
 
     def to_list_of_documents(self, data: pd.DataFrame,
                              grp_fields: list,
                              schema: dict = None,
-                             _return_data: bool = False) -> list:
+                             _final_step: bool = True) -> list:
         '''
         Reshapes a pandas dataframe to a list of documents according
          to a complex (json) mongodb schema
@@ -63,11 +61,14 @@ class DataFrameToCollection:
 
          Remark2: if the schema is stored as a json file, first load it
          to a dictionary with the help of the python json module
+
+         The function goes recursively through all the fields and reshapes
+         them depending on whether the field is an array, an object, or a
+         simple field. For each field we group the data by the grp_fields
+         and reshape it accordingly; the result is a pandas Series.
+         In the end all the series are collected and concatenated.
         '''
         from copy import deepcopy
-        from libraries.log import Log
-
-        log = Log("reshape_dataframe_to_list_of_documents:")
 
         data = self._melt_duplicated_columns(data)
 
@@ -88,18 +89,18 @@ class DataFrameToCollection:
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
 
-                n_distinct_values = data.groupby(grp_fields)[field].nunique()\
-                                        .max()
+                # check that there is only one possible value of this field
+                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
 
                 if n_distinct_values != 1:
                     err = "Field {0} is not unique with respect to {1}"\
                           .format(field, grp_fields)
 
-                    log.error(err)
+                    self._log.error(err)
                     raise Exception(err)
 
                 if field not in grp_fields:
-                    reshaped_field = data.groupby(grp_fields)[field].first()
+                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
                 else:
                     reshaped_field =\
                         data[grp_fields].drop_duplicates()\
@@ -121,7 +122,7 @@ class DataFrameToCollection:
                             data=data,
                             schema=sub_schema,
                             grp_fields=grp_fields,
-                            _return_data=True)
+                            _final_step=False)
 
                 reshaped_field = sub_data.apply(self._make_dict, axis=1)
                 reshaped_field.name = field
@@ -143,17 +144,16 @@ class DataFrameToCollection:
                          sub_schema["properties"].items()}
 
                     # extend grp fields by sub-fields of field simple types
-                    sub_grp_fields =\
-                        [f for f in sub_schema["properties"]
-                         if sub_schema["properties"][f]["bsonType"]
-                         not in ["array", "object"]]
+                    sub_grp_fields = [f for f in sub_schema["properties"]
+                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
+                                      and (f in data.columns)]
 
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
                                .format(field))
 
-                        log.error(err)
+                        self._log.error(err)
                         raise Exception(err)
 
                     # group and reshape sub-fields with complex types
@@ -161,7 +161,7 @@ class DataFrameToCollection:
                                 data=data,
                                 schema=sub_schema,
                                 grp_fields=grp_fields + sub_grp_fields,
-                                _return_data=True)
+                                _final_step=False)
 
                     if sub_data is not None:
 
@@ -172,61 +172,86 @@ class DataFrameToCollection:
                         sub_data = sub_data.reset_index(grp_fields)
 
                         reshaped_field =\
-                            sub_data.groupby(grp_fields)[field]\
+                            sub_data.groupby(grp_fields, sort=False)[field]\
                                     .apply(self._make_list_of_distinct)
 
                         reshaped_fields.append(reshaped_field)
 
                 # if field is a list of values with simple type
+                elif items_type == "array":
+
+                    grp_fields = [c for c in grp_fields if c in data.columns]
+
+                    if field in data.columns:
+
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                             .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
                 else:
 
                     grp_fields = [c for c in grp_fields if c in data.columns]
 
                     if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields)[field]\
-                                           .apply(self._make_list_of_distinct)
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                             .apply(self._make_flattened_list_of_distinct)
 
                         reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
-            reshaped_data = pd.concat(reshaped_fields, axis=1)
-
-            if not _return_data:
 
-                list_of_documents =\
-                    reshaped_data.drop(list(reshaped_data.index.names),
-                                       axis=1, errors="ignore")\
-                                 .reset_index(drop=False)
+            reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
 
-                log.info("Done reshaping the dataframe to a list of documents")
+            if _final_step:
+                # dropping the index names if it is the final step,
+                # if not the index is needed for merging
+                reshaped_fields =\
+                    reshaped_fields.drop(list(reshaped_fields.index.names), axis=1, errors="ignore")\
+                                   .reset_index(drop=False)
 
-                return list_of_documents
+                self._log.info("Done reshaping the dataframe to a list of documents")
 
-            else:
+            return reshaped_fields
 
-                return reshaped_data
+        else:
+            return
 
     def _melt_duplicated_columns(self, data: pd.DataFrame) -> pd.DataFrame:
         '''
         '''
+        data = data.copy(deep=True)
+
         for c in set(data.columns):
             if isinstance(data[c], pd.DataFrame):
+                """
                 data = pd.melt(data, id_vars=[cc for cc in data.columns
                                               if cc != c], value_vars=c)\
                          .drop("variable", axis=1)\
                          .rename(columns={"value": c})
+                """
+                data["temp"] = data[c].apply(self._make_list, axis=1)
+                data.drop(c, axis=1, inplace=True)
+                data = data.rename(columns={"temp": c})
 
         return data
 
     def _make_dict(self, x: pd.Series) -> dict:
         '''
-        return: transforms pandas series to a dictionary
+        Transforms pandas series to a dictionary
          is meant to be applied to a dataframe in axis = 1,
          then the index of the input series are the column names
          of the dataframe
         '''
-        return {f.split(".")[-1]: x[f] for f in x.index}
+        def custom_notnull(y):
+            # pd.notnull returns an array for list-like values; keep those
+            if isinstance(pd.notnull(y), bool):
+                return pd.notnull(y)
+            else:
+                return True
+
+        return {f.split(".")[-1]: x[f] for f in x.index
+                if custom_notnull(x[f])}
 
     def _make_list(self, x: pd.Series) -> list:
         '''
@@ -240,16 +265,35 @@ class DataFrameToCollection:
          entries are arbitrary objects
          (pandas unique() method does not work if entries are of complex types)
         '''
-        distinct = []
-        [distinct.append(obj) for obj in x if obj not in distinct]
-        return distinct
+        uniques = pd.DataFrame({"temp": x.tolist()})\
+                    .assign(temp_str=lambda y: y["temp"].astype(str))\
+                    .drop_duplicates(subset=["temp_str"])\
+                    .drop("temp_str", axis=1).iloc[:, 0].tolist()
+
+        def is_empty(y):
+            is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
+            is_empty_list = (isinstance(y, list) and (len(y) == 0))
+            return is_empty_dict or is_empty_list
+
+        return [el for el in uniques if not is_empty(el)]
+
+    def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
+        '''
+        return: the single distinct value of a Series whose entries are
+         lists of simple-type values; used where each group is expected
+         to hold exactly one such list
+        '''
+        uniques = self._make_list_of_distinct(x)
+        return uniques[0] if uniques else []
 
-    def _unroll_nested_names(self, columns: list) -> list:
+    def _unroll_nested_names(self, names: list) -> list:
         '''
+        Example: transform a list ["name.firstname", "name.surname"]
+        into ["name", "name.firstname", "name.surname"]
         '''
         unrolled = []
 
-        for c in columns:
+        for c in names:
             splitted = c.split(".")
             for i in range(len(splitted)):
                 unrolled.append(".".join(splitted[:i+1]))
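
The docstring above describes the reshaping only abstractly, so here is a self-contained pandas sketch that reproduces by hand what to_list_of_documents does for an array-of-objects field (the column and field names are invented; the real class additionally deduplicates the sub-documents and checks uniqueness against the schema):

    import pandas as pd

    # flat input: nested fields are addressed with "parent.child" column names
    data = pd.DataFrame({"radsatznummer": ["a1", "a1", "a2"],
                         "messungen.station": ["110", "130", "110"],
                         "messungen.wert": [1.0, 2.0, 3.0]})

    # 1) build one dict per row from the sub-fields, keeping only the last name part
    sub_docs = data.apply(lambda row: {"station": row["messungen.station"],
                                       "wert": row["messungen.wert"]}, axis=1)

    # 2) collect the dicts into one list per group key
    docs = (data.assign(messungen=sub_docs)
                .groupby("radsatznummer", sort=False)["messungen"]
                .apply(list)
                .reset_index())

    docs["messungen"][0]
    # -> [{'station': '110', 'wert': 1.0}, {'station': '130', 'wert': 2.0}]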

+ 156 - 0
cdplib/db_migration/FlattenData.py

@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Oct 9 15:17:34 2019
+
+@author: oskar
+@description: Class which flattens nested Dataframes, Dictionaries and Lists into tabular form
+"""
+
+import sys
+import os
+import time
+import pandas as pd
+import copy
+sys.path.append(os.getcwd())
+from libraries.log import Log
+log = Log("Flatten data")
+
+class FlattenData():
+
+    def __init__(self):
+        log.info('Flatten Data Initialized')
+
+    def flatten(self, data):
+        '''
+        :param data: data given as a dictionary, a list or a dataframe.
+        '''
+
+        assert(isinstance(data, (list, dict, pd.DataFrame))),\
+            "Parameter 'data' must be of list, dict or DataFrame type"
+
+        start = time.time()
+        if isinstance(data, pd.DataFrame):
+            return_data = self.flatten_dataframe(data)
+            log.info('Data has been flattened in {} seconds'.format(time.time() - start))
+            return return_data
+        if isinstance(data, dict):
+            return self.flatten_dict(data)
+        if isinstance(data, list):
+            return self.flatten_list(data)
+
+    def flatten_dataframe(self, dataframe: pd.DataFrame, incoming_key: str = None):
+        '''
+        :param pd.Dataframe dataframe: dataframe containing the data to be flattened
+        :param str incoming_key: string to be appended to the key
+        '''
+        assert(isinstance(dataframe, pd.DataFrame)),\
+            "Parameter 'dataframe' must be of DataFrame type"
+        assert(incoming_key is None or isinstance(incoming_key, str)),\
+            "Parameter 'incoming_key' must be None or of str type"
+
+        result_dict = {}
+        for index, row in dataframe.iterrows():
+            temp_result_dict = {}
+            for key, value in row.iteritems():
+                temp_result = {}
+                if incoming_key is not None:
+                    key = incoming_key + '_' + key
+                if type(value) == list:
+                    temp_result = self.flatten_list(value, key)
+                elif type(value) == dict:
+                    temp_result = self.flatten_dict(value, key)
+                else:
+                    temp_result_dict[key] = value
+
+                if len(temp_result) > 0:
+                    result_dict = self.append_to_dict(result_dict, temp_result)
+
+            result_dict[index] = copy.deepcopy(temp_result_dict)
+        
+        result_dataframe = pd.DataFrame.from_dict(result_dict, orient='index')
+        return result_dataframe
+
+    def flatten_dict(self, dictionary: dict, incoming_key: str = None):
+        '''
+        :param dict dictionary: dictionary containing the data to be flattened
+        :param str incoming_key: string to be appended to the key
+        '''
+        assert(isinstance(dictionary, dict)),\
+            "Parameter 'dictionary' must be of dict type"
+        assert(incoming_key is None or isinstance(incoming_key, str)),\
+            "Parameter 'incoming_key' must be None or of str type"
+
+
+        result_dict = {}
+        for key in dictionary:
+
+            temp_dataframe = dictionary[key]
+            temp_result = {}
+            if incoming_key is not None:
+                key = incoming_key + '_' + key
+            if type(temp_dataframe) == list:
+                temp_result = self.flatten_list(temp_dataframe, key)
+            elif type(temp_dataframe) == dict:
+                temp_result = self.flatten_dict(temp_dataframe, key)
+            else:
+                result_dict[key] = temp_dataframe
+
+            if len(temp_result) > 0:
+                result_dict = self.append_to_dict(result_dict, temp_result)
+
+        return result_dict
+
+    def flatten_list(self, data_list: list, incoming_key: str = None):
+        '''
+        :param list data_list: list containing the data to be flattened
+        :param str incoming_key: string to be appended to the key
+        '''
+
+        assert(isinstance(data_list, list)),\
+            "Parameter 'data_list' must be of list type"
+        assert(incoming_key is None or isinstance(incoming_key, str)),\
+            "Parameter 'incoming_key' must be None or of str type"
+
+        result_dict = {}
+        for iteration, item in enumerate(data_list):
+
+            temp_dataframe = item
+            temp_result = {}
+            key = incoming_key
+            if incoming_key is not None:
+                if type(data_list[iteration]) is dict:
+                    if 'stationsnummer' in data_list[iteration].keys() and 'stage' in data_list[iteration].keys() :
+                        
+                        key = incoming_key + '_' + str(data_list[iteration]['stationsnummer']) + '_' + str(data_list[iteration]['stage'])
+                else:
+                    key = incoming_key + '_' + str(iteration)
+            if type(temp_dataframe) == list:
+                temp_result = self.flatten_list(temp_dataframe, key)
+                result_dict = self.append_to_dict(result_dict, temp_result)
+            
+            elif type(temp_dataframe) == dict:
+                temp_result = self.flatten_dict(temp_dataframe, key)
+                result_dict = self.append_to_dict(result_dict, temp_result)
+            else:
+                result_dict[key] = temp_dataframe
+
+            if len(temp_result) > 0:
+                result_dict = self.append_to_dict(result_dict, temp_result)
+                
+        return result_dict
+
+    def append_to_dict(self, dictionary: dict, to_append):
+        '''
+        :param dict dictionary: dictionary which holds all the resulting data.
+        :param dict to_append: data to be added to the resulting dictionary.
+        '''
+        assert(isinstance(dictionary, dict)),\
+            "Parameter 'dictionary' must be of dict type"
+        assert(isinstance(to_append, dict)),\
+            "Parameter 'to_append' must be of dict type"
+
+        for key in to_append:
+            dictionary[key] = to_append[key]
+        
+        return dictionary
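
A concrete input/output pair for the flattener (the keys are invented; with the corrected asserts above, plain dictionaries and lists are accepted directly, and nested keys are joined with underscores):

    nested = {"radsatznummer": "a1",
              "prozess": {"stationsnummer": 110, "stage": "A", "wert": 1.0}}

    # flattener = FlattenData()        # needs libraries.log importable, hence commented out
    # flattener.flatten(nested)
    # -> {"radsatznummer": "a1",
    #     "prozess_stationsnummer": 110,
    #     "prozess_stage": "A",
    #     "prozess_wert": 1.0}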

+ 79 - 35
cdplib/db_migration/MigrationCleaning.py

@@ -16,11 +16,11 @@ sys.path.append(os.getcwd())
 
 from libraries.db_migration.ParseMapping import ParseMapping
 from libraries.db_migration.ParseJsonSchema import ParseJsonSchema
-from libraries.utils.ClassLogging import ClassLogging
+from libraries.utils.ExceptionsHandler import ExceptionsHandler
 from libraries.utils.CleaningUtils import CleaningUtils
+from libraries.log import Log
 
-
-class MigrationCleaning(ClassLogging):
+class MigrationCleaning:
     '''
     Class for correcting and filtering the incorrect data.
     We keep the correcting and the filtering methods separated,
@@ -33,12 +33,12 @@ class MigrationCleaning(ClassLogging):
                  mapping_source: str = "internal_name",
                  mapping_target: str = "mongo_name",
                  mapping_parser: type = ParseMapping,
-                 schema_parser: type = ParseJsonSchema,
-                 log_name: str = "MigrationCleaning"):
+                 schema_parser: type = ParseJsonSchema):
         '''
         '''
-        super().__init__(log_name=log_name)
-
+        self._log = Log('Migration Cleaning')
+        self._exception_handler = ExceptionsHandler()
+        
         assert isinstance(inconsist_report_table, str),\
             "Inconsistent report table should be a tablename string"
 
@@ -58,6 +58,9 @@ class MigrationCleaning(ClassLogging):
         self._mapping_path = mapping_path
         self._schema_paths = schema_paths
 
+        from libraries.db_handlers.SQLHandler import SQLHandlerPool
+        self._sql_db = SQLHandlerPool(20)
+
     def _assert_dataframe_input(self, data: pd.DataFrame):
         '''
         '''
@@ -220,19 +223,26 @@ class MigrationCleaning(ClassLogging):
 
         data = data.copy(deep=True)
 
-        db = SQLHandler()
+        db = self._sql_db.aquire()
 
         if invalid_mask.sum() == 0:
 
+            self._sql_db.release(db)
             return data
 
         data_inconsist = data.assign(reason=reason)\
                              .loc[invalid_mask]\
                              .reset_index(drop=True)
 
+        if db.check_if_table_exists(self._inconsist_report_table):
+            columns = db.get_column_names(tablename=self._inconsist_report_table)
+
+            if len(columns) > 0:
+                data_inconsist = data_inconsist[columns]
+
         db.append_to_table(data=data_inconsist,
                            tablename=self._inconsist_report_table)
-
+       
         n_rows_filtered = len(data_inconsist)
         n_instances_filtered = len(data_inconsist[self._filter_index_columns].drop_duplicates())
 
@@ -255,6 +265,8 @@ class MigrationCleaning(ClassLogging):
 
         data = data.loc[~all_index.isin(nok_index)].reset_index(drop=True)
 
+        self._sql_db.release(db)
+
         return data
 
     def _replace_values(self, data: pd.DataFrame,
@@ -303,7 +315,7 @@ class MigrationCleaning(ClassLogging):
 
             except Exception as e:
 
-                self.log_and_raise(("Failed to replace {0} values "
+                self._exception_handler.log_and_raise(("Failed to replace {0} values "
                                     "in {1}. Exit with error {2}"
                                     .format(default_str, column, e)))
 
@@ -350,6 +362,7 @@ class MigrationCleaning(ClassLogging):
 
                 else:
 
+                    data = data.copy(deep=True)
                     data[column] = data[column].astype(python_type)
 
                 if data[column].dtype != python_type:
@@ -363,7 +376,7 @@ class MigrationCleaning(ClassLogging):
 
             except Exception as e:
 
-                self.log_and_raise(("Failed to convert types in {0}. "
+                self._exception_handler.log_and_raise(("Failed to convert types in {0}. "
                                     "Exit with error {1}"
                                     .format(column, e)))
 
@@ -371,7 +384,7 @@ class MigrationCleaning(ClassLogging):
 
         return data
 
-    def filter_invalid_null_values(self, data: pd.DataFrame) -> pd.DataFrame:
+    def filter_invalid_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
         '''
         '''
         self._assert_dataframe_input(data)
@@ -397,7 +410,12 @@ class MigrationCleaning(ClassLogging):
         '''
         self._assert_dataframe_input(data)
 
-        for column, python_type in self._python_types.items():
+        for column in data.columns:
+
+            if column not in self._python_types:
+                continue
+
+            python_type = self._python_types[column]
 
             if data[column].dtype != python_type:
 
@@ -419,7 +437,12 @@ class MigrationCleaning(ClassLogging):
         '''
         self._assert_dataframe_input(data)
 
-        for column, pattern in self._patterns:
+        for column in data.columns:
+
+            if column not in self._patterns:
+                continue
+
+            pattern = self._patterns[column]
 
             invalid_mask = (~data[column].astype(str).str.match(pattern))
 
@@ -431,41 +454,64 @@ class MigrationCleaning(ClassLogging):
 
         return data
 
-    def filter_notallowed_values(self, data: pd.DataFrame) -> pd.DataFrame:
+    def filter_invalid_values(self, data: pd.DataFrame) -> pd.DataFrame:
         '''
         '''
-        for column, value in self._minimum_values.items():
+        for column in data.columns:
 
-            invalid_mask = data[column] > value
+            if column in self._minimum_values:
 
-            reason = "Too large values in field {}".format(column)
+                min_value = self._minimum_values[column]
 
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
+                invalid_mask = data[column] > min_value
 
-        for column, value in self._maximum_values.items():
+                reason = "Too large values in field {}".format(column)
 
-            invalid_mask = data[column] < value
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
 
-            reason = "Too small values in field {}".format(column)
+            elif column in self._maximum_values:
 
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
+                max_value = self._maximum_values[column]
 
-        for column, allowed_values in self._allowed_values.items():
+                invalid_mask = data[column] < max_value
 
-            invalid_mask = (~data[column].isin(allowed_values))
+                reason = "Too small values in field {}".format(column)
 
-            reason = "Too small values in field {}".format(column)
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
 
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
+            elif column in self._allowed_values:
+
+                allowed_values = self._allowed_values[column]
+
+                invalid_mask = (~data[column].isin(allowed_values))
+
+                not_allowed_examples = data.loc[invalid_mask, column].unique()[:3]
+
+                reason = "Not allowed values {0}... in field {1}"\
+                         .format(not_allowed_examples, column)
+
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
+
+            else:
+                continue
 
         return data
 
+    def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
+        '''
+        '''
+        mongo_fields = self._schema_parser.get_fields_restricted_to_collection(collection_name=collection_name)
+
+        fields = self._mapping_parser.get_fields_restricted_to_collecton(collection_name=collection_name)
+
+        return data[[c for c in data.columns if (c in fields) or (c in mongo_fields)]]
+
 
 if __name__ == "__main__":
 
@@ -483,8 +529,6 @@ if __name__ == "__main__":
 
     if all([os.path.isfile(p) for p in schema_paths + [mapping_path]]):
 
-        print("Found schemas!")
-
         cleaner = MigrationCleaning(
                 mapping_path=mapping_path,
                 schema_paths=schema_paths,
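
The rewritten filter_invalid_values applies at most one rule per column (minimum, then maximum, then allowed values). The allowed-values branch boils down to the following masking logic; the rule and the example rows are made up, reusing the status labels from the removed configuration:

    import pandas as pd

    data = pd.DataFrame({"radsatznummer": ["a1", "a2", "a3"],
                         "status": ["Gut", "Schlecht", "kaputt"]})
    allowed_values = {"status": ["Gut", "Schlecht", "Scheiden", "Fertig"]}

    invalid_mask = ~data["status"].isin(allowed_values["status"])
    not_allowed_examples = data.loc[invalid_mask, "status"].unique()[:3]
    reason = "Not allowed values {0}... in field {1}".format(not_allowed_examples, "status")

    # _filter_invalid_data then appends the flagged rows (with this reason) to the
    # inconsist_report_table and drops the affected instances from the working dataframe
    data = data.loc[~invalid_mask].reset_index(drop=True)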

+ 38 - 16
cdplib/db_migration/ParseJsonSchema.py

@@ -39,6 +39,8 @@ class ParseJsonSchema(ParseDbSchema):
         if isinstance(schema_paths, str):
             schema_paths = [schema_paths]
 
+        self._schema_paths = schema_paths
+
         self.schemas = []
 
         for schema_path in schema_paths:
@@ -53,11 +55,25 @@ class ParseJsonSchema(ParseDbSchema):
                 self._log.error(err)
                 raise Exception(err)
 
+    @property
+    def _collection_names(self) -> list:
+        '''
+        '''
+        # Don't use strip() instead of replace(), since schema_c.strip("schema_")
+        # would discard the trailing c as well, which is not the desired output
+        return [os.path.basename(p).replace("schema_", "").split(".")[0]
+                for p in self._schema_paths]
+
     def get_fields(self) -> list:
         '''
         '''
         return self._parse()
 
+    def get_fields_restricted_to_collection(self, collection_name: str) -> list:
+        '''
+        '''
+        schemas = [self.schemas[self._collection_names.index(collection_name)]]
+        return self._parse(schemas=schemas)
+
     def get_required_fields(self) -> list:
         '''
         '''
@@ -82,17 +98,17 @@ class ParseJsonSchema(ParseDbSchema):
         mongo_types = self.get_mongo_types()
         python_types = {}
 
-        bson_to_python_types_except_dates = {"double": float,
-                                             "decimal": float,
-                                             "string": str,
-                                             "object": object,
-                                             "array": list,
-                                             "bool": bool,
-                                             "int": int,
-                                             "long": int,
-                                             "date": np.dtype('<M8[ns]'),
-                                             "timestamp": np.dtype('<M8[ns]')
-                                             }
+        bson_to_python_types = {"double": float,
+                                "decimal": float,
+                                "string": str,
+                                "object": object,
+                                "array": list,
+                                "bool": bool,
+                                "int": int,
+                                "long": int,
+                                "date": np.dtype('<M8[ns]'),
+                                "timestamp": np.dtype('<M8[ns]')
+                                }
 
         for k, v in mongo_types.items():
 
@@ -110,8 +126,8 @@ class ParseJsonSchema(ParseDbSchema):
                     self._log.error(err)
                     raise Exception(err)
 
-            if v in bson_to_python_types_except_dates:
-                python_types[k] = bson_to_python_types_except_dates[v]
+            if v in bson_to_python_types:
+                python_types[k] = bson_to_python_types[v]
 
         return python_types
 
@@ -157,14 +173,18 @@ class ParseJsonSchema(ParseDbSchema):
 
     def _parse(self,
                field_info: str = None,
-               required_only: bool = False):
+               required_only: bool = False,
+               schemas: list = None):
         '''
         '''
-        result = self._parse_one(schema=self.schemas[0],
+        if schemas is None:
+            schemas = self.schemas
+
+        result = self._parse_one(schema=schemas[0],
                                  field_info=field_info,
                                  required_only=required_only)
 
-        for schema in self.schemas[1:]:
+        for schema in schemas[1:]:
 
             next_result = self._parse_one(schema=schema,
                                           field_info=field_info,
@@ -238,6 +258,8 @@ class ParseJsonSchema(ParseDbSchema):
         if "properties" in schema.keys():
             if "required" in schema.keys():
                 required_subfields = schema["required"]
+            else:
+                required_subfields = []
 
             for sub_field_name in schema["properties"].keys():
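
get_fields_restricted_to_collection relies on the file naming convention encoded in _collection_names above: a schema file called schema_<collection>.json maps to the collection <collection>. For example (the paths are invented):

    import os

    schema_paths = ["mongo_schema/schema_wheelsets.json",
                    "mongo_schema/schema_processes.json"]

    [os.path.basename(p).replace("schema_", "").split(".")[0] for p in schema_paths]
    # -> ['wheelsets', 'processes']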
 

+ 11 - 3
cdplib/db_migration/ParseMapping.py

@@ -10,19 +10,20 @@ import os
 import sys
 import numpy as np
 sys.path.append(os.getcwd())
-
+from libraries.log import Log
 
 class ParseMapping:
     '''
     '''
     def __init__(self, mapping_path: str, log_name: str = "ParseMapping",
-                 source: str = "original_name", target: str = "original_name"):
+                 source: str = "original_name", target: str = "mongo_name",
+                 target_collection: str = "mongo_collection"):
         '''
         '''
         import json
         from libraries.log import Log
 
-        self._log = Log(log_name)
+        self._log = Log('Parse Mapping')
 
         if not os.path.isfile(mapping_path):
             err = "Mapping not found"
@@ -41,6 +42,7 @@ class ParseMapping:
 
         self._source = source
         self._target = target
+        self._target_collection = target_collection
 
     def get_field_mapping(self) -> dict:
         '''
@@ -71,6 +73,12 @@ class ParseMapping:
         return self._get_fields_satistisfying_condition(key="type",
                                                         value="Date")
 
+    def get_fields_restricted_to_collecton(self, collection_name: str) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key=self._target_collection,
+                                                        value=collection_name)
+
     def _get_info(self, key: str, value=None) -> dict:
         '''
         '''
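
Judging from the keys the parser reads (original_name, mongo_name, mongo_collection and type), a single entry of the json mapping presumably looks roughly like the dictionary below; the concrete values are invented:

    mapping_entry = {
        "original_name": "Datum",          # column name in the source data
        "mongo_name": "datum",             # field name in the mongodb schema
        "mongo_collection": "processes",   # used by get_fields_restricted_to_collecton
        "type": "Date"                     # marks the field for get_date_fields
    }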

BIN
cdplib/db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/MigrationCleaning.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/ParseDbSchema.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/ParseMapping.cpython-37.pyc


+ 798 - 0
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -0,0 +1,798 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov  9 13:27:44 2018
+
+@author: tanja
+@description: Implementation of machine learning
+                pipeline selection and tuning with hyperopt library
+"""
+
+import os
+import sys
+import gc
+import logging
+import pickle
+import time
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
+    space_eval, pyll
+
+from sklearn.model_selection import cross_validate
+
+
+class HyperoptPipelineSelection:
+    '''
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    developed by the hyperopt creators).
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    '''
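+    # A hedged sketch of the kind of search space the description above refers
+    # to; the keys and estimators here are illustrative only, and the real
+    # example mentioned in the docstring lives in this file's main block,
+    # which is not included in this excerpt:
+    #
+    #     space = hp.choice("pipelines", [
+    #         {"name": "rf",
+    #          "pipeline": Pipeline([("rf", RandomForestClassifier())]),
+    #          "params": {"rf__n_estimators":
+    #                     pyll.scope.int(hp.quniform("rf__n_estimators", 50, 500, 1))}},
+    #         {"name": "svc",
+    #          "pipeline": Pipeline([("svc", SVC())]),
+    #          "params": {"svc__C": hp.loguniform("svc__C", -3, 3)}},
+    #     ])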
+    def __init__(self,
+                 cost_func,
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = 1,
+                 log_path: str = None,
+                 averaging_func: callable = None):
+        '''
+        :param callable cost_func: function to minimize or maximize
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from
+            the beginning.
+
+        :param int backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+
+        :param str log_path: Optional, when not provided logs to stdout.
+
+        :param callable averaging_func: optional,
+            when not provided set to mean. Function
+            to aggregate the cross-validated values of the cost function.
+            The classic choice is the mean; another option is,
+            for example, mean() - c*var().
+        '''
+
+        assert(callable(cost_func)),\
+            "Parameter 'cost_func' must be a callable"
+
+        assert(isinstance(greater_is_better, bool)),\
+            "Parameter 'greater_is_better' must be bool type"
+
+        assert(isinstance(trials_path, str)),\
+            "Parameter 'trials_path' must be of string type"
+
+        if averaging_func is not None:
+            assert(callable(averaging_func)),\
+                "Parameter 'averaging_func' must be a callable"
+
+        self._assert_valid_directory(path=trials_path)
+
+        self._configure_logger(log_path)
+
+        self._cost_func = cost_func
+        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        self._score_factor = (not greater_is_better) - greater_is_better
+        self._trials_path = trials_path
+        # is initialized with empty trials object
+        self._trials = Trials()
+        self._backup_trials_freq = backup_trials_freq
+        self._averaging_func = averaging_func or np.mean
+        # keeping track of the current search iteration
+        self._run_number = 0
+        # space and data need to be attached to perform search.
+        self._space_attached = False
+        self._data_attached = False
+
+        # if a trials object already exists at the given path,
+        # it is loaded and the search is continued. Else,
+        # the search is started from the beginning.
+        if os.path.isfile(trials_path):
+            try:
+                with open(trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(len(self._trials.trials)))
+
+            except Exception as e:
+                self._logger.error(("Trials object could not be loaded. "
+                                    "Training starts from the beginning. "
+                                    "Exit with error {}").format(e))
+
+        else:
+            self._logger.info(("No existing trials object was found"
+                               "Initialized an empty trials object."))
+
+        self._best_score = self.best_trial_score
+
+    def _configure_logger(self, log_path: str = None):
+        '''
+        Can be replaced with the existing script later.
+        When log_path is not provided, logs to stdout.
+        '''
+
+        self._logger = logging.getLogger(__name__)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_path is not None:
+            assert(isinstance(log_path, str)),\
+                "Parameter 'log_path' must be of string type"
+            self._assert_valid_directory(log_path)
+
+            handler = logging.FileHandler(log_path)
+        else:
+            handler = logging.StreamHandler(sys.stdout)
+
+        formatter = logging.Formatter(
+                '\n %(asctime)s %(levelname)s %(message)s')
+
+        handler.setFormatter(formatter)
+        self._logger.addHandler(handler)
+        self._logger.setLevel("INFO")
+
+    def _backup_trials(self):
+        '''
+        Pickles (Saves) the trials object.
+        Used in a scheduler.
+        '''
+        with open(self._trials_path, "wb") as f:
+            pickle.dump(self._trials, f)
+
+    def _assert_valid_directory(self, path: str):
+        '''
+        If the directory of a path does not exist yet,
+        creates it.
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must be of str type"
+
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            os.makedirs(dirname, exist_ok=True)
+
+    def attach_space(self, space: pyll.base.Apply = None,
+                     module_path: str = None,
+                     name: str = None):
+        '''
+        :param pyll.base.Apply space: hyperopt space where
+            the search is performed. Optional when a space
+            is loaded from a python module.
+
+        :param str module_path: path to python module
+            where the space is defined. Optional when
+            the space is provided directly.
+
+        :param str name: name of the space loaded from
+            a python module. Optional when the space
+            is provided directly.
+        '''
+        assert((space is not None) or
+               ((module_path is not None) and (name is not None))),\
+            "Either space or (module_path, name) must be provided"
+
+        if space is None:
+            for p_name, p_value in [("module_path", module_path),
+                                    ("name", name)]:
+                assert(isinstance(p_value, str)),\
+                    "Parameter '{}' must be of str type".format(p_name)
+
+            assert(os.path.isfile(module_path)),\
+                "Parameter 'module_path' must be a valid file"
+
+            module, extension = os.path.splitext(os.path.basename(module_path))
+            assert(extension == ".py"),\
+                "Parameter 'space' must be read from a python file"
+
+            sys.path.insert(0, os.path.dirname(module_path))
+
+            try:
+                space = getattr(importlib.import_module(module), name)
+            except (ImportError, AttributeError):
+                err = "Invalid space location or name"
+                self._logger.error(err)
+                raise Exception(err)
+
+        assert(isinstance(space, pyll.base.Apply)),\
+            "Parameter 'space' must be of hyperopt space type"
+
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self._space_attached = True
+
+    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
+            -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            e = 'The argument must be a numpy array or a pandas DataFrame'
+            self._logger.critical(e)
+            raise ValueError(e)
+
+    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
+                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    X_val: (pd.DataFrame, np.ndarray) = None,
+                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    cv: (list, int) = None):
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (not all algorithms require targets)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: list of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+
+        X_train = self._convert_to_array(X_train)
+        if y_train is not None:
+            y_train = self._convert_to_array(y_train)
+
+        if X_val is not None:
+            if cv is not None:
+                self._logger.warning(("Both validation set and cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on the validation set!"))
+
+            X_val = self._convert_to_array(X_val)
+
+            train_inds = list(range(len(X_train)))
+            val_inds = list(range(len(X_train),
+                                  len(X_train) + len(X_val)))
+
+            # cost is evaluated with a cross validation function
+            # that accepts an array and a cv object with
+            # indices of the fold splits.
+            # Here we create a trivial cv object
+            # with one validation split.
+            self._cv = [(train_inds, val_inds)]
+            self._X = np.concatenate([X_train, X_val])
+
+            if y_train is not None:
+                if y_val is None:
+                    err = "Argument y_val must be provided"
+                    self._logger.critical(err)
+                    raise ValueError(err)
+                else:
+                    y_val = self._convert_to_array(y_val)
+                    self._y = np.concatenate([y_train, y_val])
+            else:
+                self._y = None
+        else:
+            if cv is None:
+                self._logger.warning(("Neither validation set nor cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on 5 randomly "
+                                      "splitted folds."))
+
+            self._X = X_train
+            self._y = y_train
+            self._cv = cv
+
+        self._logger.info("Attached data")
+        self._data_attached = True
+
+    def _evaluate(self, pipeline: Pipeline) -> dict:
+        '''
+        This method is called in _objective.
+
+        Calculates the cost on the attached data.
+        This function can be overridden when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :output: dictionary with the aggregated
+            cross-validation score and
+            the score variance.
+        '''
+
+        scores = cross_validate(estimator=pipeline,
+                                X=self._X,
+                                y=self._y,
+                                cv=self._cv or 5,
+                                scoring=make_scorer(self._cost_func),
+                                error_score=np.nan)
+
+        return {'value': self._averaging_func(scores['test_score']),
+                'variance': np.var(scores['test_score'])}
+
+    def _objective(self, space_element: dict) -> dict:
+        '''
+        This method is called in search_for_best_pipeline
+        inside the hyperopt fmin method.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss;
+        when _evaluate returns a value that should be maximized,
+        it is multiplied by -1 to obtain the loss.
+
+        :param dict space_element: must contain keys
+            name (with the name of the pipeline),
+            pipeline (Pipeline object),
+            params (dict of pipeline params)
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+        '''
+        assert(isinstance(space_element, dict) and
+               set(['name', 'pipeline', 'params']) <= space_element.keys())
+
+        assert(isinstance(space_element['name'], str) and
+               isinstance(space_element['pipeline'], Pipeline) and
+               isinstance(space_element['params'], dict))
+
+        start_time = time.time()
+
+        if not self._data_attached:
+            raise Exception(("Data must be attached in order "
+                             "in order to effectuate the best"
+                             "pipeline search"))
+
+        self._run_number += 1
+
+        pipeline = space_element['pipeline']
+        params = space_element['params']
+        pipeline.set_params(**params)
+
+        self._logger.info(("Run number {0}: "
+                           "Current score is {1}: "
+                           "Training pipeline {2} "
+                           "with parameters: {3}. ").format(
+                             self._run_number,
+                             self._best_score,
+                             space_element['name'],
+                             params))
+
+        try:
+            score_stats = self._evaluate(pipeline)
+            assert(not np.isnan(score_stats["value"])),\
+                "Returned null score"
+
+            if self._run_number % self._backup_trials_freq == 0:
+                self._backup_trials()
+
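+            # np.nan != np.nan, so the first condition below is True only
+            # when no successful trial has produced a best score yet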
+            if (self._best_score != self._best_score) or\
+                self._score_factor*score_stats["value"] <\
+                    self._score_factor*self._best_score:
+
+                self._logger.info("Score got better, new best score is: {}"
+                                  .format(score_stats["value"]))
+
+                self._best_score = score_stats['value']
+
+                self._backup_trials()
+
+            end_time = time.time()
+
+            return {'loss': self._score_factor * score_stats["value"],
+                    'status': STATUS_OK,
+                    'score': score_stats["value"],
+                    'score_variance': score_stats["variance"],
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': end_time - start_time}
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            return {'loss': np.nan,
+                    'status': STATUS_FAIL,
+                    'score': np.nan,
+                    'score_variance': np.nan,
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': np.nan}
+
+    def search_for_best_pipeline(self,
+                                 niter: int,
+                                 algo: callable = tpe.suggest):
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param callable algo: now can only take values tpe for a tree-based
+            random search or random for random search
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        assert(isinstance(niter, int)),\
+            "Parameter 'niter' must be of int type"
+
+        # right now only these two suggestion algorithms are supported
+        assert(algo in [tpe.suggest, rand.suggest]),\
+            ("Parameter 'algo' can currently only be tpe or random. "
+             "If other algorithms have been developed by "
+             "hyperopt, please add them to the list.")
+
+        try:
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
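+            # max_evals counts all trials stored in the trials object,
+            # so previously completed trials are included in the total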
+            best = fmin(fn=self._objective,
+                        space=self._space,
+                        algo=algo,
+                        trials=self._trials,
+                        max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best)))
+
+            self._backup_trials()
+
+        except Exception as e:
+            raise ValueError(("Failed to select best "
+                             "pipeline! Exit with error: {}").format(e))
+
+    @property
+    def best_trial_score(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_score_variance(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score_variance"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_pipeline(self) -> Pipeline:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > 0:
+
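+            # trial['misc']['vals'] holds the raw values sampled by hyperopt;
+            # space_eval maps them back to the concrete pipeline object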
+            return space_eval(
+                    self._space,
+                    {k: v[0] for k, v in
+                     self._trials.best_trial['misc']['vals'].items()
+                     if len(v) > 0})["pipeline"]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def _ith_trial_loss(self, i: int) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > i:
+            return self._trials.trials[i]['result']['loss']
+        else:
+            return np.nan
+
+    def _ith_trial_element(self, i: int, name: str) -> object:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > i:
+            return space_eval(self._space,
+                              {k: v[0] for k, v in
+                               self._trials.trials[i]['misc']['vals']
+                               .items() if len(v) > 0})[name]
+
+    def _ith_trial_pipeline(self, i: int) -> Pipeline:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='pipeline')
+
+    def _ith_trial_name(self, i: int) -> str:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='name')
+
+    def _ith_trial_params(self, i: int) -> dict:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='params')
+
+    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
+        '''
+        '''
+        if len(self._trials.trials) > i:
+            return self._trials.trials[i]["result"]["timestamp"]
+
+    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
+        '''
+        Returns the list of n best pipelines
+        documented in trials
+        '''
+        if len(self._trials.trials) > 0:
+            if losses is None:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))]
+
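+            # take the n smallest distinct losses and map each one back to
+            # the index of the first trial that achieved it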
+            best_n_indices = [losses.index(l)
+                              for l in sorted(list(set(losses)))[:n]]
+
+            return [self._ith_trial_pipeline(i) for i in best_n_indices]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
+        '''
+        Returns a dictionary whose keys are pipeline names
+        and whose values are lists of the best pipelines of that type
+        '''
+        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
+
+        if len(self._trials.trials) > 0:
+
+            best_pipelines_per_type = {}
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+
+            for nm in names:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))
+                          if self._ith_trial_name(i) == nm]
+
+                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
+                                                        n=n,
+                                                        losses=losses)
+
+            return best_pipelines_per_type
+
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def write_trials_documentation(self, path: str = None):
+        '''
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        '''
+        path = path or "hyperopt_trials_documentation.xlsx"
+
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must be of string type"
+
+        self._assert_valid_directory(path)
+
+        if len(self._trials.trials) > 0:
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+            scores = [self._score_factor*self._ith_trial_loss(i)
+                      for i in range(len(self._trials.trials))]
+            params = [self._ith_trial_params(i)
+                      for i in range(len(self._trials.trials))]
+            timestamps = [self._ith_trial_timestamp(i)
+                          for i in range(len(self._trials.trials))]
+
+        else:
+            names = []
+            scores = []
+            params = []
+            timestamps = []
+
+        pd.DataFrame({"name": names,
+                      "score": scores,
+                      "params": params,
+                      "timestamp": timestamps})\
+          .to_excel(path)
+
+
+if __name__ == '__main__':
+
+    from sklearn.metrics import roc_auc_score, make_scorer
+    from xgboost import XGBClassifier
+    from sklearn.svm import SVC
+    from sklearn.feature_selection import SelectKBest
+    from sklearn.decomposition import PCA
+    from sklearn.datasets import load_iris
+    from pprint import pprint
+
+    data = load_iris()
+    X = pd.DataFrame(data.data)
+    y = pd.Series(data.target)
+    # produce a binary target variable
+    y = (y == 2).astype(int)
+    del data
+    gc.collect()
+
+    # SPACE DEFINITION ########################################
+    # (can be moved to a separate python script)
+
+    """
+    A search space must be a list of dictionaries.
+    Each dictionary must have the keys:
+        name (pipeline name or type),
+        pipeline (instance of sklearn.pipeline.Pipeline),
+        params (dictionary of distributions for the parameters of
+                the pipeline that we want to tune)
+
+    Here we have a space that consists of two dictionaries:
+    KBEST_XGBOOST and PCA_SVC
+    """
+    space = []
+
+    pipeline_dist_1 = {}
+    pipeline_dist_1["name"] = "KBEST_XGBOOST"
+
+    """
+    A pipeline consists of steps (tuples).
+    Each step has a name and an algorithm.
+    This pipeline, as a first step performs
+    feature selection with SelectKBest and
+    as a second step evaluates a machine learning algo (xgboost).
+
+    Like all sklearn algorithms, a Pipeline has methods
+    fit, predict, set_params, get_params
+    """
+    pipeline_dist_1["pipeline"] = Pipeline([
+                                     ('kbest', SelectKBest()),
+                                     ('xgb', XGBClassifier())
+                                     ])
+    """
+    Pipeline parameter dictionaries must be of the form:
+    {'kbest__k': 3, xgb__n_estimators: 20},
+    each parameter name consists of the step name, __, and parameter name.
+
+    Here, instead of values, the parameter names are followed
+    by hyperopt distributions.
+    Each hyperopt distribution also must have a name,
+    due to hyperopt functionality.
+
+    Here, we set the hyperopt distribution name to the step name,
+    but it does not have to be so. Hyperopt distribution names
+    must be different for different elements of the space.
+    """
+
+    pipeline_dist_1["params"] = {
+            'kbest__k': hp.choice('kbest__k', range(1, 5)),
+
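+            # hp.randint('xgb__n_estimators', 50) samples an integer in
+            # [0, 50), so n_estimators ranges from 50 to 99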
+            'xgb__n_estimators':
+            50 + hp.randint('xgb__n_estimators', 50),
+
+            "xgb__learning_rate":
+            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
+            }
+
+    space.append(pipeline_dist_1)
+
+    pipeline_dist_2 = {}
+    pipeline_dist_2["name"] = "PCA_SVC"
+
+    pipeline_dist_2["pipeline"] = Pipeline([
+                                     ('pca', PCA()),
+                                     ('svc', SVC(gamma="scale"))
+                                     ])
+
+    pipeline_dist_2["params"] = {
+            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
+
+            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
+            }
+
+    space.append(pipeline_dist_2)
+
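+    # hp.choice makes hyperopt pick one of the pipeline dictionaries
+    # (and its parameter distributions) in each trial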
+    space = hp.choice('pipelines', space)
+
+    # TESTING ##########################################################
+
+    trials_path = 'TEST_hyperopt_trials.pkl'
+
+    doc_path = 'TEST_hyperopt_doc.xlsx'
+
+    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
+                                       greater_is_better=True,
+                                       trials_path=trials_path)
+
+    hp_obj.attach_data(X_train=X, y_train=y)
+
+    hp_obj.attach_space(space=space)
+
+    hp_obj.search_for_best_pipeline(niter=10)
+
+    print('\n', '='*20, 'TESTING', '='*20)
+
+    print('\n', 'Best score:', hp_obj.best_trial_score)
+
+    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
+
+    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
+
+    print('\n', 'Best 3 pipelines: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
+
+    print('\n', 'Best pipeline per type: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
+
+    hp_obj.write_trials_documentation(path=doc_path)
+
+    # os.remove(doc_path)
+    # os.remove(trials_path)

+ 38 - 6
cdplib/log.py

@@ -6,12 +6,19 @@
 import sys
 import os
 import logging
+from datetime import datetime
 
+sys.path.append(os.getcwd())
 
-class Log:
+
+class Log:
+    '''
+    Logger wrapper that writes messages to a dated log file
+    and, optionally, to stdout.
+    '''
     def __init__(self, name: str = None,
                  log_file: str = None,
-                 log_level: str = "INFO",
+                 log_level: str = "ERROR",
+                 stdout_log_level: str = "INFO",
                  print_to_stdout: bool = True):
         """Sets the log level and the path where the log file is stored
 
@@ -23,30 +30,34 @@ class Log:
 
         self._logger = logging.getLogger(name)
 
+        self._logger.setLevel("DEBUG")
+
         if (self._logger.hasHandlers()):
             self._logger.handlers.clear()
 
         if log_file is None:
-            log_file = os.path.join(".", "all.log")
+            log_file = os.path.join(".", "logs", str(datetime.today().date()) + ".log")
 
         assert(isinstance(log_file, str)),\
             "Parameter 'log_path' must be of string type"
 
+        os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
         formatter = logging.Formatter(
                 '\n %(name)s %(asctime)s %(levelname)s %(message)s')
 
-        os.makedirs(os.path.dirname(log_file), exist_ok=True)
-
         file_handler = logging.FileHandler(log_file)
         file_handler.setFormatter(formatter)
+        file_handler.setLevel(log_level)
         self._logger.addHandler(file_handler)
 
         if print_to_stdout:
             stream_handler = logging.StreamHandler(sys.stdout)
             stream_handler.setFormatter(formatter)
+            stream_handler.setLevel(stdout_log_level)
             self._logger.addHandler(stream_handler)
 
-        self._logger.setLevel(log_level)
 
     def info(self, message: str):
         self._logger.info(message)
@@ -56,3 +67,24 @@ class Log:
 
     def error(self, message: str):
         self._logger.error(message)
+
+    def log_and_raise_error(self, message):
+        '''
+        '''
+        self._logger.error(message, exc_info=True)
+
+        raise Exception(message)
+
+    def log_and_raise_error_stack_info(self, message):
+        '''
+        '''
+        self._logger.error(message, exc_info=True, stack_info=True)
+
+        raise Exception(message)
+
+    def log_and_raise_warning(self, message):
+        '''
+        '''
+        self._logger.warning(message)
+
+        raise Warning(message)