tanja 5 years ago
Parent
Commit
03ba54fdbd
40 changed files with 5168 additions and 0 deletions
  1. BIN
      cdplib/__pycache__/__init__.cpython-37.pyc
  2. 270 0
      cdplib/feature_engineering/StatisticalFeatures.py
  3. 77 0
      cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py
  4. 53 0
      cdplib/feature_engineering/StatisticalFeaturesOverTime.py
  5. 798 0
      cdplib/hyperopt/HyperoptPipelineSelection.py
  6. 211 0
      db_handlers/MongodbHandler.py
  7. 595 0
      db_handlers/SQLHandler.py
  8. BIN
      db_handlers/__pycache__/MongodbHandler.cpython-37.pyc
  9. BIN
      db_handlers/__pycache__/SQLHandler.cpython-37.pyc
  10. BIN
      db_handlers/__pycache__/SQLOperations.cpython-37.pyc
  11. 352 0
      db_migration/DataFrameToCollection.py
  12. 520 0
      db_migration/MigrationCleaning.py
  13. 62 0
      db_migration/ParseDbSchema.py
  14. 332 0
      db_migration/ParseJsonSchema.py
  15. 157 0
      db_migration/ParseMapping.py
  16. BIN
      db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc
  17. BIN
      db_migration/__pycache__/MigrationCleaning.cpython-37.pyc
  18. BIN
      db_migration/__pycache__/ParseDbSchema.cpython-37.pyc
  19. BIN
      db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc
  20. BIN
      db_migration/__pycache__/ParseMapping.cpython-37.pyc
  21. 798 0
      hyperopt/HyperoptPipelineSelection.py
  22. 130 0
      import_process_instances/CleanProcessTable.py
  23. 87 0
      import_process_instances/CleanRs0.py
  24. 170 0
      import_process_instances/CleanRs1.py
  25. 82 0
      import_process_instances/CleanRs2.py
  26. 58 0
      import_process_instances/CleanRs70.py
  27. 149 0
      import_process_instances/MergeProcessTables.py
  28. BIN
      import_process_instances/__pycache__/CleanProcessTable.cpython-37.pyc
  29. BIN
      import_process_instances/__pycache__/CleanRs0.cpython-37.pyc
  30. BIN
      import_process_instances/__pycache__/CleanRs1.cpython-37.pyc
  31. BIN
      import_process_instances/__pycache__/CleanRs2.cpython-37.pyc
  32. BIN
      import_process_instances/__pycache__/CleanRs70.cpython-37.pyc
  33. BIN
      import_process_instances/__pycache__/MergeProcessTables.cpython-37.pyc
  34. BIN
      import_process_instances/__pycache__/parallelized_import.cpython-37.pyc
  35. 74 0
      import_process_instances/parallelized_import.py
  36. 58 0
      log.py
  37. 73 0
      utils/ClassLogging.py
  38. 62 0
      utils/CleaningUtils.py
  39. BIN
      utils/__pycache__/ClassLogging.cpython-37.pyc
  40. BIN
      utils/__pycache__/CleaningUtils.cpython-37.pyc

BIN
cdplib/__pycache__/__init__.cpython-37.pyc


+ 270 - 0
cdplib/feature_engineering/StatisticalFeatures.py

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" 
+Created on Tue Oct 16 16:08:47 2018
+
+@author: tanya
+"""
+import types
+import logging
+import pandas as pd
+
+from collections import defaultdict
+from functools import reduce
+
+from libraries.logging.logging_utils import configure_logging
+from libraries.exception_handling import InputChecks
+          
+class StatisticalFeatures:
+    '''
+    Groups data by index columns and returns aggregated statistics for given columns.
+
+    :param pandas.DataFrame data: data to aggregate
+    :param list index_cols: columns to group by
+    :param str path_to_log: optional path to a log file
+
+    Aggregations are passed to the methods either as a list of tuples of the form
+    [(colname_1, [aggfunc_1, aggfunc_2]), (colname_2, aggfunc_3)]
+    or as a dictionary of the form {colname_1: [aggfunc_1, aggfunc_2], colname_2: aggfunc_3},
+    where colname_i is a column to aggregate and aggfunc_i is either
+    a function variable or a string accepted by pandas as a built-in function name.
+    NOTE: using strings for built-in functions speeds up the calculations by a factor >= 20.
+    WARNING: if multiple aggfuncs with the same name are given for a column (like 'sum' and np.sum),
+    only the first one is kept.
+    WARNING: nan values are ignored by numpy and pandas built-in aggregation functions.
+    '''
+    def __init__(self, data, index_cols, path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+            
+        self.logger = logging.getLogger(__name__)
+        
+        self.checks = InputChecks(logger = self.logger)
+        
+        self.data = data
+        
+        self.checks.assert_correct_type({'data': [pd.DataFrame]})
+            
+        self.index_cols = index_cols
+        
+        # make warning about missing values in index columns
+        for col in self.index_cols:
+            if data[col].isnull().any():
+                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
+
+        
+    def get_kpis_by_aggregation(self, kpis):
+        '''
+        Aggregates given fields with given aggregation functions
+         USE CASE: per product find the mean and standard deviation of the price
+        
+        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
+         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
+         where the aggfuncs are reducing functions, given either as function objects or as strings naming pandas built-in functions
+         
+        :return: features with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        def get_valid_agg_dict_from_kpis(kpis):
+            '''
+            Filters inputs of incorrect shape or type,
+            Filters out columns not present in data
+            Removes multiple functions with the same name
+            Makes a quick check that the aggregation with the given fields and functions does not fail on the first 2 lines
+            Reports to the log
+            :param list or dict kpis:
+            '''
+            def get_name(x):
+                '''
+                Returns function name for function and does nothing for string
+                '''
+                if isinstance(x, types.FunctionType):
+                    return x.__name__
+                else:
+                    return x
+                
+            def passed_first_line_type_control(col, aggfunc):
+                '''
+                Checks if aggregation works on the first 2 lines of the data
+                '''
+                try:
+                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+                    self.data.iloc[:2]\
+                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                             .groupby(self.index_cols)\
+                             .agg({col : aggfunc})
+                    return True
+                except Exception as e:
+                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error: ' + str(e))
+                    return False
+           
+            
+            
+            valid_kpi_dict = defaultdict(list)
+            
+            if isinstance(kpis, list):
+                incorrect_kpis = [kpi for kpi in kpis if len(kpi) != 2]
+                if len(incorrect_kpis) > 0:
+                    self.logger.warning('Inputs ' + str(incorrect_kpis) + ' do not have the correct length.')
+                
+                cols = list(zip(*kpis))[0]             
+                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
+            elif isinstance(kpis, dict):
+                cols = list(kpis.keys())
+                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
+                
+            cols_not_in_data = set(cols) - set(self.data.columns)
+            if len(cols_not_in_data) > 0:
+                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
+                
+            for col, aggfuncs in kpis:
+                if not isinstance(aggfuncs, list):
+                    aggfuncs = [aggfuncs]
+                
+                for aggfunc in aggfuncs:
+                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
+                    if not is_new_funcname:
+                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
+                    
+                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
+                        valid_kpi_dict[col].append(aggfunc)
+                    
+            return valid_kpi_dict
+                   
+        
+        
+        
+        agg_dict = get_valid_agg_dict_from_kpis(kpis)
+        
+        if len(agg_dict) > 0:
+        
+            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
+                             else '_'.join([col, str(aggfunc)]) 
+                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
+            
+            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                       .groupby(self.index_cols)\
+                       .agg(agg_dict)\
+                       .set_axis(new_names, axis = 'columns', inplace = False)\
+                       .reset_index()
+        else:
+            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+        
+        
+        
+        
+        
+        
+        
+    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
+        '''
+        A wrapper around the pandas crosstab method with the index equal to index_cols
+        USE CASE: per product find the standard deviation of the price in each city
+        
+        :param str pivot_col: column whose values become columns in the output
+        :param str value_col: column name used to fill in the values
+        :param str or func aggfunc: count if None
+        :param list entries: values of pivot_col to show
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        
+        # assert that types of the inputs are correct
+        types_to_check = {'pivot_col' : [str], 
+                          'value_col' : [str, type(None)],  
+                          'aggfunc' : [str, types.FunctionType, type(None)], 
+                          'entries' : [list, type(None)]}
+        
+        self.checks.assert_correct_type(types_to_check)
+        
+        cols_to_check = [pivot_col]
+        if not value_col is None:
+            cols_to_check.append(value_col)
+        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
+
+        if not entries is None:
+            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
+        else:
+            entry_filter = pd.Series([True]*len(self.data))              
+    
+        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
+        columns = self.data.loc[entry_filter, pivot_col]
+        if not value_col is None:
+            values = self.data.loc[entry_filter, value_col]
+        else:
+            values = None
+                        
+        result = pd.crosstab(index = index, columns = columns, values = values, aggfunc = aggfunc)
+        result = result.rename(columns = {c : str(value_col) + '_' + str(c) for c in result.columns})\
+                       .reset_index()
+        return result
+    
+
+
+
+
+        
+    
+    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
+        '''
+        Aggregates values obtained with method get_value_stats
+         USE CASE: per product find average variation of the price over all cities
+         
+        :param str pivot_col:
+        :param str value_col:
+        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
+        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
+        :param list entries: 
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
+        
+        value_stat_kpis = self.get_value_stats(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
+
+        result = value_stat_kpis[self.index_cols].copy(deep = True)
+        
+        for aggfunc in aggfuncs_step2:
+            colname = '_'.join([str(aggfunc), str(aggfunc_step1), str(value_col), str(pivot_col)])
+            
+            if isinstance(aggfunc, str):
+                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
+            else:
+                result[colname] = value_stat_kpis.set_index(self.index_cols)\
+                                                 .apply(aggfunc, axis = 1)\
+                                                 .reset_index(drop = True)
+                                                 
+        return result
+                              
+                              
+                              
+                              
+                                                            
+    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
+        '''
+        Finds argmin or argmax of a column
+         USE CASE: per product find the city with maximum variation of the price
+        
+        :param str min_or_max: must be in ['min', 'max']
+        :param str pivot_col:
+        :param str value_col:
+        :param str aggfunc:    
+        '''
+        self.checks.assert_valid_value(argname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
+        
+        if min_or_max == 'max':
+            aggfuncs_step2 = ['idxmax']
+        else:
+            aggfuncs_step2 = ['idxmin']
+            
+        return self.get_aggregated_value_stats(pivot_col = pivot_col, 
+                                               value_col = value_col, 
+                                               aggfunc_step1 = aggfunc, 
+                                               aggfuncs_step2 = aggfuncs_step2)
+        
+        
+        
+        
+    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
+    
+    # can also add pick detection from the other project and calculate the number of picks. Probably first create TimeSeriesManipulation class.
+    
+    # write tests for all methods

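A brief usage sketch (editorial addition, not part of the commit) of the two kpi formats accepted by get_kpis_by_aggregation and of the get_value_stats crosstab wrapper. The DataFrame and column names are hypothetical, and the libraries.* dependencies (configure_logging, InputChecks) are assumed to be importable so the class can be instantiated.

import pandas as pd

# hypothetical data: one price observation per product and city
data = pd.DataFrame({'product': ['a', 'a', 'b', 'b'],
                     'city':    ['x', 'y', 'x', 'y'],
                     'price':   [1.0, 2.0, 3.0, 5.0]})

features = StatisticalFeatures(data=data, index_cols=['product'])

# list-of-tuples form: per product, mean and standard deviation of the price
kpis_as_list = [('price', ['mean', 'std'])]
# equivalent dictionary form
kpis_as_dict = {'price': ['mean', 'std']}

print(features.get_kpis_by_aggregation(kpis_as_list))
# expected columns: product, price_mean, price_std

# crosstab wrapper: per product, mean price in each city
print(features.get_value_stats(pivot_col='city', value_col='price', aggfunc='mean'))
# expected columns: product, price_x, price_y
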
+ 77 - 0
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 15:11:21 2018
+
+@author: tanya
+"""
+
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
+
+
+class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
+    '''
+    '''
+    
+    def __init__(self, data, index_cols, date_col, split_date, period_length, past_or_future = 'past', freq = 'days', n_periods = 1, path_to_log = None):
+        '''
+        '''
+        super(StatisticalFeaturesAveragedOverTimePeriods, self).__init__(data = data.copy(deep = True),
+                                                                         index_cols = index_cols,
+                                                                         date_col = date_col,
+                                                                         split_date = split_date,
+                                                                         period_length = n_periods*period_length,
+                                                                         past_or_future = past_or_future,
+                                                                         freq = freq,
+                                                                         path_to_log = path_to_log)
+        
+        self.period_number_col = 'period_number'
+        while self.period_number_col in data.columns:
+            self.period_number_col += '&'
+        
+        period_numbers = self.data[self.index_cols + [date_col]].drop_duplicates()\
+                             .groupby(index_cols)[date_col].cumcount()\
+                             .reset_index()\
+                             .assign(period_number = lambda x: x[0]/period_length)\
+                             .rename(columns = {'period_number' : self.period_number_col})
+                                       
+                
+        self.data = pd.merge(self.data, period_numbers, how = 'left', on = self.index_cols)
+                            
+        self.initial_index_cols = self.index_cols.copy()
+        self.index_cols.append(self.period_number_col)
+        
+        
+    def _aggregate_over_time_periods(self, df):
+        '''
+        '''
+        return df.drop(self.period_number_col, axis = 1)\
+                 .groupby(self.initial_index_cols)\
+                 .mean()\
+                 .reset_index()
+        
+        
+    def get_kpis_by_aggregation(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
+                                                      .get_kpis_by_aggregation(**args))
+            
+            
+    def get_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
+                                                 .get_value_stats(**args))
+        
+        
+    def get_aggregated_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
+                                                 .get_aggregated_value_stats(**args))
+        
+    
+        

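A minimal sketch (editorial addition) of how this class might be instantiated, assuming the same hypothetical product/price data as in the previous sketch plus a 'date' column, and assuming the libraries.* dependencies resolve. Statistics are computed per 7-day period within the 4*7-day window before split_date and then averaged over the periods.

import pandas as pd

features = StatisticalFeaturesAveragedOverTimePeriods(data=data,
                                                      index_cols=['product'],
                                                      date_col='date',
                                                      split_date=pd.Timestamp('2018-11-01'),
                                                      period_length=7,
                                                      n_periods=4,
                                                      past_or_future='past',
                                                      freq='days')

# per product: mean price within each 7-day period, averaged over the periods
print(features.get_kpis_by_aggregation(kpis={'price': ['mean']}))
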
+ 53 - 0
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 14:02:18 2018
+
+@author: tanya
+"""
+
+import logging
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
+from libraries.exception_handling import InputChecks, InputCasts
+from libraries.logging.logging_utils import configure_logging
+
+class StatisticalFeaturesOverTime(StatisticalFeatures):
+    '''
+    '''
+    def __init__(self, data, index_cols, date_col, split_date, period_length = None, past_or_future = 'past', freq = 'days', path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+        self.logger = logging.getLogger(__name__)
+        self.checks = InputChecks(logger = self.logger)
+        self.casts = InputCasts(logger = self.logger)
+        
+        self.checks.assert_column_presence(data = data, colnames = [date_col])
+        self.checks.assert_valid_value(argname = 'past_or_future', val = past_or_future, valid_values = ['past', 'future'])
+        self.checks.assert_valid_value(argname = 'freq', val = freq, valid_values = ['seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years'])
+        
+        
+        split_date = self.casts.cast_arg_to_pandas_datetime(argname = 'split_date', val = split_date)
+        data[date_col] = self.casts.cast_column_to_pandas_datetime(series = data[date_col], colname = date_col, all_or_any = 'all')
+        
+        if past_or_future == 'past':
+            if not period_length is None:
+                min_date = split_date - pd.DateOffset(**{freq : period_length})
+            else:
+                min_date = data[date_col].min()
+            sup_date = split_date
+        else:
+            min_date = split_date
+            if not period_length is None:
+                sup_date = split_date + pd.DateOffset(**{freq : period_length})
+            else: 
+                sup_date = split_date + pd.DateOffset(**{freq : 1})
+        
+        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)
+        
+        super(StatisticalFeaturesOverTime, self).__init__(data = data.loc[time_mask].reset_index(drop = True).copy(deep = True),
+                                                          index_cols = index_cols,
+                                                          path_to_log = path_to_log)

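The window construction in __init__ can be illustrated in isolation (editorial sketch, plain pandas only): for past_or_future='past', freq='days' and period_length=30, only rows dated within the 30 days before split_date survive the time mask.

import pandas as pd

split_date = pd.Timestamp('2018-11-07')
min_date = split_date - pd.DateOffset(days=30)   # lower bound, inclusive
sup_date = split_date                            # upper bound, exclusive

dates = pd.Series(pd.to_datetime(['2018-10-01', '2018-10-20', '2018-11-07']))
time_mask = (dates >= min_date) & (dates < sup_date)
print(time_mask.tolist())  # [False, True, False]
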
+ 798 - 0
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -0,0 +1,798 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov  9 13:27:44 2018
+
+@author: tanja
+@description: Implementation of machine learning
+                pipeline selection and tuning with hyperopt library
+"""
+
+import os
+import sys
+import gc
+import logging
+import pickle
+import time
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
+    space_eval, pyll
+
+from sklearn.model_selection import cross_validate
+from sklearn.metrics import make_scorer
+
+
+class HyperoptPipelineSelection:
+    '''
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    being developed by the hyperopt creators.)
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    '''
+    def __init__(self,
+                 cost_func,
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = 1,
+                 log_path: str = None,
+                 averaging_func: callable = None):
+        '''
+        :param callable cost_func: function to minimize or maximize
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from
+            the beginning.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+
+        :param str log_path: Optional, when not provided logs to stdout.
+
+        :param callable averaging_func: optional,
+            when not provided set to mean. Function
+            to aggregate the cross-validated values of the cost function.
+            The classic choice is to take the mean;
+            another example is mean() - c*var().
+        '''
+
+        assert(callable(cost_func)),\
+            "Parameter 'cost_func' must be a callable"
+
+        assert(isinstance(greater_is_better, bool)),\
+            "Parameter 'greater_is_better' must be bool type"
+
+        assert(isinstance(trials_path, str)),\
+            "Parameter 'trials_path' must be of string type"
+
+        if averaging_func is not None:
+            assert(callable(averaging_func)),\
+                "Parameter 'averaging_func' must be a callable"
+
+        self._assert_valid_directory(path=trials_path)
+
+        self._configure_logger(log_path)
+
+        self._cost_func = cost_func
+        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        self._score_factor = (not greater_is_better) - greater_is_better
+        self._trials_path = trials_path
+        # is initialized with empty trials object
+        self._trials = Trials()
+        self._backup_trials_freq = backup_trials_freq
+        self._averaging_func = averaging_func or np.mean
+        # keeping track of the current search iteration
+        self._run_number = 0
+        # space and data need to be attached to perform search.
+        self._space_attached = False
+        self._data_attached = False
+
+        # if a trials object already exists at the given path,
+        # it is loaded and the search is continued. Else,
+        # the search is started from the beginning.
+        if os.path.isfile(trials_path):
+            try:
+                with open(trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(len(self._trials.trials)))
+
+            except Exception as e:
+                self._logger.error(("Trials object could not be loaded. "
+                                    "Training starts from the beginning. "
+                                    "Exit with error {}").format(e))
+
+        else:
+            self._logger.info(("No existing trials object was found"
+                               "Initialized an empty trials object."))
+
+        self._best_score = self.best_trial_score
+
+    def _configure_logger(self, log_path: str = None):
+        '''
+        Can be replaced with the existing script later.
+        When log_path is not provided, logs to stdout.
+        '''
+
+        self._logger = logging.getLogger(__name__)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_path is not None:
+            assert(isinstance(log_path, str)),\
+                "Parameter 'log_path' must be of string type"
+            self._assert_valid_directory(log_path)
+
+            handler = logging.FileHandler(log_path)
+        else:
+            handler = logging.StreamHandler(sys.stdout)
+
+        formatter = logging.Formatter(
+                '\n %(asctime)s %(levelname)s %(message)s')
+
+        handler.setFormatter(formatter)
+        self._logger.addHandler(handler)
+        self._logger.setLevel("INFO")
+
+    def _backup_trials(self):
+        '''
+        Pickles (Saves) the trials object.
+        Used in a scheduler.
+        '''
+        with open(self._trials_path, "wb") as f:
+            pickle.dump(self._trials, f)
+
+    def _assert_valid_directory(self, path: str):
+        '''
+        If the directory of a path does not exist yet,
+        creates it.
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must of str type"
+
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            os.makedirs(dirname, exist_ok=True)
+
+    def attach_space(self, space: pyll.base.Apply = None,
+                     module_path: str = None,
+                     name: str = None):
+        '''
+        :param pyll.base.Apply space: hyperopt space where
+            the search is performed. Optional when a space
+            is loaded from a python module.
+
+        :param str module_path: path to python module
+            where the space is defined. Optional when
+            the space is provided directly.
+
+        :param str name: name of the space loaded from
+            a python module. Optional when the space
+            is provided directly.
+        '''
+        assert((space is not None) or
+               ((module_path is not None) and (name is not None))),\
+            "Either space or (module_path, name) must be provided"
+
+        if space is None:
+            for pname, pval in [("module_path", module_path), ("name", name)]:
+                assert(isinstance(pval, str)),\
+                    "Parameter '{}' must be of str type".format(pname)
+
+            assert(os.path.isfile(module_path)),\
+                "Parameter 'module_path' must be a valid file"
+
+            module, extension = os.path.splitext(os.path.basename(module_path))
+            assert(extension == ".py"),\
+                "Parameter 'space' must be read from a python file"
+
+            sys.path.insert(0, os.path.dirname(module_path))
+
+            try:
+                space = getattr(__import__(module), name)
+            except (ImportError, AttributeError):
+                err = "Invalid space location or name"
+                self._logger.error(err)
+                raise Exception(err)
+
+        assert(isinstance(space, pyll.base.Apply)),\
+            "Parameter 'space' must be of hyperopt space type"
+
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self._space_attached = True
+
+    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
+            -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            e = 'The argument must be a numpy array or a pandas DataFrame'
+            self._logger.critical(e)
+            raise ValueError(e)
+
+    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
+                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    X_val: (pd.DataFrame, np.ndarray) = None,
+                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    cv: (list, int) = None):
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (not all algorithms require targets)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: list of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+
+        X_train = self._convert_to_array(X_train)
+        if y_train is not None:
+            y_train = self._convert_to_array(y_train)
+
+        if X_val is not None:
+            if cv is not None:
+                self._logger.warning(("Both validation set and cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on the validation set!"))
+
+            X_val = self._convert_to_array(X_val)
+
+            train_inds = list(range(len(X_train)))
+            val_inds = list(range(len(X_train),
+                                  len(X_train) + len(X_val)))
+
+            # cost is evaluated with a cross validation function
+            # that accepts an array and a cv object with
+            # indices of the fold splits.
+            # Here we create a trivial cv object
+            # with one validation split.
+            self._cv = [(train_inds, val_inds)]
+            self._X = np.concatenate([X_train, X_val])
+
+            if y_train is not None:
+                if y_val is None:
+                    err = "Argument y_val must be provided"
+                    self._logger.critical(err)
+                    raise ValueError(err)
+                else:
+                    y_val = self._convert_to_array(y_val)
+                    self._y = np.concatenate([y_train, y_val])
+            else:
+                self._y = None
+        else:
+            if cv is None:
+                self._logger.warning(("Neither validation set nor cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on 5 randomly "
+                                      "splitted folds."))
+
+            self._X = X_train
+            self._y = y_train
+            self._cv = cv
+
+        self._logger.info("Attached data")
+        self._data_attached = True
+
+    def _evaluate(self, pipeline: Pipeline) -> dict:
+        '''
+        This method is called in _objective.
+
+        Calculates the cost on the attached data.
+        This function can be overriden, when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :output: dictionary with the aggregated
+            cross-validation score and
+            the score variance.
+        '''
+
+        scores = cross_validate(estimator=pipeline,
+                                X=self._X,
+                                y=self._y,
+                                cv=self._cv or 5,
+                                scoring=make_scorer(self._cost_func),
+                                error_score=np.nan)
+
+        return {'value': self._averaging_func(scores['test_score']),
+                'variance': np.var(scores['test_score'])}
+
+    def _objective(self, space_element: dict) -> dict:
+        '''
+        This method is called in search_for_best_pipeline
+        inside the hyperopt fmin method.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it should be multiplied by -1 to obtain loss.
+
+        :param dict space_element: must contain keys
+            name (with the name of the pipeline),
+            pipeline (Pipeline object),
+            params (dict of pipeline params)
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+        '''
+        assert(isinstance(space_element, dict) and
+               set(['name', 'pipeline', 'params']) <= space_element.keys())
+
+        assert(isinstance(space_element['name'], str) and
+               isinstance(space_element['pipeline'], Pipeline) and
+               isinstance(space_element['params'], dict))
+
+        start_time = time.time()
+
+        if not self._data_attached:
+            raise Exception(("Data must be attached in order "
+                             "in order to effectuate the best"
+                             "pipeline search"))
+
+        self._run_number += 1
+
+        pipeline = space_element['pipeline']
+        params = space_element['params']
+        pipeline.set_params(**params)
+
+        self._logger.info(("Run number {0}: "
+                           "Current score is {1}: "
+                           "Training pipeline {2} "
+                           "with parameters: {3}. ").format(
+                             self._run_number,
+                             self._best_score,
+                             space_element['name'],
+                             params))
+
+        try:
+            score_stats = self._evaluate(pipeline)
+            assert(not np.isnan(score_stats["value"])),\
+                "Returned null score"
+
+            if self._run_number % self._backup_trials_freq == 0:
+                self._backup_trials()
+
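+            # np.nan != np.nan, so the first condition detects that no best score exists yet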
+            if (self._best_score != self._best_score) or\
+                self._score_factor*score_stats["value"] <\
+                    self._score_factor*self._best_score:
+
+                self._logger.info("Score got better, new best score is: {}"
+                                  .format(score_stats["value"]))
+
+                self._best_score = score_stats['value']
+
+                self._backup_trials()
+
+            end_time = time.time()
+
+            return {'loss': self._score_factor * score_stats["value"],
+                    'status': STATUS_OK,
+                    'score': score_stats["value"],
+                    'score_variance': score_stats["variance"],
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': end_time - start_time}
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            return {'loss': np.nan,
+                    'status': STATUS_FAIL,
+                    'score': np.nan,
+                    'score_variance': np.nan,
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': np.nan}
+
+    def search_for_best_pipeline(self,
+                                 niter: int,
+                                 algo: callable = tpe.suggest):
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param callable algo: now can only take values tpe for a tree-based
+            random search or random for random search
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        assert(isinstance(niter, int)),\
+            "Parameter 'niter' must be of int type"
+
+        # right now only two algorithms are provided by hyperopt
+        assert(algo in [tpe.suggest, rand.suggest]),\
+            ("Parameter 'algo' can currently only be tpe or random. "
+             "If other algorithms have been developed by "
+             "hyperopt, please add them to the list.")
+
+        try:
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            best = fmin(fn=self._objective,
+                        space=self._space,
+                        algo=algo,
+                        trials=self._trials,
+                        max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best)))
+
+            self._backup_trials()
+
+        except Exception as e:
+            raise ValueError(("Failed to select best "
+                             "pipeline! Exit with error: {}").format(e))
+
+    @property
+    def best_trial_score(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_score_variance(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score_variance"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_pipeline(self) -> Pipeline:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > 0:
+
+            return space_eval(
+                    self._space,
+                    {k: v[0] for k, v in
+                     self._trials.best_trial['misc']['vals'].items()
+                     if len(v) > 0})["pipeline"]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def _ith_trial_loss(self, i: int) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > i:
+            return self._trials.trials[i]['result']['loss']
+        else:
+            return np.nan
+
+    def _ith_trial_element(self, i: int, name: str) -> object:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > i:
+            return space_eval(self._space,
+                              {k: v[0] for k, v in
+                               self._trials.trials[i]['misc']['vals']
+                               .items() if len(v) > 0})[name]
+
+    def _ith_trial_pipeline(self, i: int) -> Pipeline:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='pipeline')
+
+    def _ith_trial_name(self, i: int) -> str:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='name')
+
+    def _ith_trial_params(self, i: int) -> dict:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='params')
+
+    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
+        '''
+        '''
+        if len(self._trials.trials) > i:
+            return self._trials.trials[i]["result"]["timestamp"]
+
+    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
+        '''
+        Returns the list of n best pipelines
+        documented in trials
+        '''
+        if len(self._trials.trials) > 0:
+            if losses is None:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))]
+
+            best_n_indices = [losses.index(l)
+                              for l in sorted(list(set(losses)))[:n]]
+
+            return [self._ith_trial_pipeline(i) for i in best_n_indices]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
+        '''
+        Returns a dictionary where keys are pipeline names,
+        and values are lists of best pipelines with this name
+        '''
+        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
+
+        if len(self._trials.trials) > 0:
+
+            best_pipelines_per_type = {}
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+
+            for nm in names:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))
+                          if self._ith_trial_name(i) == nm]
+
+                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
+                                                        n=n,
+                                                        losses=losses)
+
+            return best_pipelines_per_type
+
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def write_trials_documentation(self, path: str = None):
+        '''
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        '''
+        path = path or "hyperopt_trials_documentation.xlsx"
+
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must be of string type"
+
+        self._assert_valid_directory(path)
+
+        if len(self._trials.trials) > 0:
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+            scores = [self._score_factor*self._ith_trial_loss(i)
+                      for i in range(len(self._trials.trials))]
+            params = [self._ith_trial_params(i)
+                      for i in range(len(self._trials.trials))]
+            timestamps = [self._ith_trial_timestamp(i)
+                          for i in range(len(self._trials.trials))]
+
+        else:
+            names = []
+            scores = []
+            params = []
+            timestamps = []
+
+        pd.DataFrame({"name": names,
+                      "score": scores,
+                      "params": params,
+                      "timestamp": timestamps})\
+          .to_excel(path)
+
+
+if __name__ == '__main__':
+
+    from sklearn.metrics import roc_auc_score, make_scorer
+    from xgboost import XGBClassifier
+    from sklearn.svm import SVC
+    from sklearn.feature_selection import SelectKBest
+    from sklearn.decomposition import PCA
+    from sklearn.datasets import load_iris
+    from pprint import pprint
+
+    data = load_iris()
+    X = pd.DataFrame(data.data)
+    y = pd.Series(data.target)
+    # produce a binary variable
+    y = (y == 2).astype(int)
+    del data
+    gc.collect()
+
+    # SPACE DEFINITION ########################################
+    # (can be moved to a separate python script)
+
+    """
+    A search space must be a list of dictionaries.
+    Each dictionary must have keys:
+        name (pipeline name or type),
+        pipeline (instance of sklearn.pipeline.Pipeline),
+        params (dictionary of distributions for the parameters of
+                the pipeline that we want to tune)
+
+    Here we have a space that consists of two dictionaries:
+    KBEST_XGBOOST and PCA_SVC
+    """
+    space = []
+
+    pipeline_dist_1 = {}
+    pipeline_dist_1["name"] = "KBEST_XGBOOST"
+
+    """
+    A pipeline consists of steps (tuples).
+    Each step has a name and an algorithm.
+    This pipeline, as a first step performs
+    feature selection with SelectKBest and
+    as a second step evaluates a machine learning algo (xgboost).
+
+    Like all sklearn algorithms, a Pipeline has methods
+    fit, predict, set_params, get_params
+    """
+    pipeline_dist_1["pipeline"] = Pipeline([
+                                     ('kbest', SelectKBest()),
+                                     ('xgb', XGBClassifier())
+                                     ])
+    """
+    Pipeline parameter dictionaries must be of the form:
+    {'kbest__k': 3, xgb__n_estimators: 20},
+    each parameter name consists of the step name, __, and parameter name.
+
+    Here, instead of values, the parameter names are followed
+    by hyperopt distributions.
+    Each hyperopt distribution also must have a name,
+    due to hyperopt functionality.
+
+    Here, we set the hyperopt distribution name to the step name,
+    but it does not have to be so. Hyperopt distribution names
+    must be different for different elements of the space.
+    """
+
+    pipeline_dist_1["params"] = {
+            'kbest__k': hp.choice('kbest__k', range(1, 5)),
+
+            'xgb__n_estimators':
+            50 + hp.randint('xgb__n_estimators', 50),
+
+            "xgb__learning_rate":
+            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
+            }
+
+    space.append(pipeline_dist_1)
+
+    pipeline_dist_2 = {}
+    pipeline_dist_2["name"] = "PCA_SVC"
+
+    pipeline_dist_2["pipeline"] = Pipeline([
+                                     ('pca', PCA()),
+                                     ('svc', SVC(gamma="scale"))
+                                     ])
+
+    pipeline_dist_2["params"] = {
+            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
+
+            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
+            }
+
+    space.append(pipeline_dist_2)
+
+    space = hp.choice('pipelines', space)
+
+    # TESTING ##########################################################
+
+    trials_path = 'TEST_hyperopt_trials.pkl'
+
+    doc_path = 'TEST_hyperopt_doc.xlsx'
+
+    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
+                                       greater_is_better=True,
+                                       trials_path=trials_path)
+
+    hp_obj.attach_data(X_train=X, y_train=y)
+
+    hp_obj.attach_space(space=space)
+
+    hp_obj.search_for_best_pipeline(niter=10)
+
+    print('\n', '='*20, 'TESTING', '='*20)
+
+    print('\n', 'Best score:', hp_obj.best_trial_score)
+
+    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
+
+    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
+
+    print('\n', 'Best 3 pipelines: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
+
+    print('\n', 'Best pipeline per type: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
+
+    hp_obj.write_trials_documentation(path=doc_path)
+
+    # os.remove(doc_path)
+    # os.remove(trials_path)

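Because the trials object is pickled to trials_path, a search can be resumed later. A sketch (editorial addition) reusing the X, y, space and trials_path objects from the test block above:

# A new selector constructed with the same trials_path loads the saved trials and continues.
hp_obj_resumed = HyperoptPipelineSelection(cost_func=roc_auc_score,
                                           greater_is_better=True,
                                           trials_path=trials_path)
hp_obj_resumed.attach_data(X_train=X, y_train=y)
hp_obj_resumed.attach_space(space=space)
hp_obj_resumed.search_for_best_pipeline(niter=5)  # 5 more trials on top of the previous ones
print(hp_obj_resumed.best_trial_score)
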
+ 211 - 0
db_handlers/MongodbHandler.py

@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Created on Mon Sep 16 13:27:44 2019
+
+@author: oskar
+@description: Implementation of a database handler that abstracts access to MongoDB.
+"""
+
+
+import json
+import simplejson
+import sys
+import os
+import jsonref
+
+from copy import deepcopy
+from pymongo import MongoClient
+import pandas as pd
+import numpy as np
+
+sys.path.append(os.getcwd())
+from libraries.log import Log
+from libraries.configuration import default as cfg
+
+class MongodbHandler:
+
+    '''
+
+    '''
+
+    def __init__(self, database_url: str = cfg['MONGO_DB']['URI'],
+                 database_name: str = cfg['MONGO_DB']['DATABASE_NAME']):
+        '''
+        :param str database_url: Url for the mongodb database
+        :param str database_name: Name of the database the database handler should handle
+        '''
+        assert(isinstance(database_url, str)),\
+            "Parameter 'database_url' must be a string type"
+        assert(isinstance(database_name, str)),\
+            "Parameter 'database_name' must be a string type"
+
+        self._log = Log("\nMongodbHandler script")
+
+        self._log.info('Mongodb Handler has been initialized')
+        # Connect to the MongoDB
+        self._client = MongoClient(database_url)
+        # Connect to the given database, or create it if it doesn't exist.
+        self._database = self._client[database_name]
+
+    def _read_schema(self, schema_path: str) -> dict:
+        '''
+        :param str schema_path: path to the schema file.
+        '''
+
+        assert(isinstance(schema_path, str)),\
+            "Parameter 'schema_path must be a string type"
+
+        with open(schema_path) as json_file:
+            schema = json.load(json_file)
+
+        if 'definitions' in schema:
+            schema = self._dereference_schema(schema)
+
+        return schema
+
+    def _dereference_schema(self, schema: dict) -> dict:
+        '''
+        :param dict schema: dictionary containing a schema which uses references.
+        '''
+
+        assert(isinstance(schema, dict)),\
+            "Parameter 'schema' must be a dictionary type"
+
+        schema = jsonref.loads(str(schema).replace("'", "\""))
+        schema = deepcopy(schema)
+        schema.pop('definitions', None)
+        return schema
+
+    def set_collection_schema(self, collection_name: str, schema_path: str,
+                              validation_level: str = 'moderate',validation_action: str = 'error'):
+        '''
+        :param str collection_name: name of the collection for which the schema will be set.
+        :param str schema_path: path to the schema file.
+        :param str validation_level: level of validation done by the mongodb.
+        :param str validation_action: action taken on validation failure: 'warn' or 'error'.
+        '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
+        assert(isinstance(schema_path, str)),\
+            "Parameter 'schema_path' must be a string type"
+        assert(isinstance(validation_level, str)),\
+            "Parameter 'validation_lever' must be a string type"
+        assert(isinstance(validation_action, str)),\
+            "Parameter 'validation_action' must be a string type"
+
+        schema = self._read_schema(schema_path)
+
+        command = {
+                    'collMod': collection_name,
+                    'validator': {
+                        '$jsonSchema': schema
+                    },
+                    'validationLevel': validation_level,
+                    'validationAction': validation_action
+                    }
+
+        self._database.command(command)
+
+    def create_collection(self, collection_name):
+        '''
+        :param str collection_name: name of the collection to be created.
+        '''
+
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
+
+        if collection_name not in self._database.list_collection_names():
+            self._log.info(("Collection '{}' has been created").format(collection_name))
+            return self._database.create_collection(collection_name)
+        else:
+            self._log.info(("Collection '{}' already exists").format(collection_name))
+            return self._database[collection_name]
+
+    def insert_data_into_collection(self, data: (dict, list, np.ndarray, pd.DataFrame, pd.Series),
+                                    collection_name: str,
+                                    ordered: bool = False):
+        '''
+        :param data: data to be inserted into the collection (dict, list, numpy array, DataFrame or Series)
+        :param str collection_name: name of the collection the data will be added to.
+        :param bool ordered: whether insert_many should insert the documents in order.
+        '''
+
+        allowed_types = (dict, list, np.ndarray, pd.DataFrame, pd.Series)
+
+        assert(isinstance(data, allowed_types)),\
+            "Parameter 'data' is of invalid type"
+
+        if isinstance(data, np.ndarray):
+            data = pd.DataFrame(data)
+
+        if isinstance(data, pd.DataFrame):
+
+            data = simplejson.loads(data.to_json(orient="records",
+                                                 date_format="iso"))
+
+        elif isinstance(data, pd.Series):
+
+            data = simplejson.loads(data.to_json(date_format="iso"))
+
+        if (len(data) == 1) or (isinstance(data, dict)):
+
+            if isinstance(data, list):
+                data = data[0]
+
+            self._database[collection_name].insert_one(data)
+        else:
+            self._database[collection_name].insert_many(data, ordered=ordered)
+
+        self._log.info(('Data has been inserted into the {} collection').format(collection_name))
+
+    def create_collection_and_set_schema(self, collection_name: str, schema_path: str):
+        '''
+        :param str collection_name: name of the collection to be created.
+        :param str schema_path: path to the schema file.
+        '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
+        assert(isinstance(schema_path, str)),\
+            "Parameter 'schema_path' must be a string type"
+
+        self.create_collection(collection_name)
+        self.set_collection_schema(collection_name=collection_name, schema_path=schema_path)
+
+    def query_data_and_generate_dataframe(self, collection_name: str, attribute: str = None,
+                                          attribute_value: str = None, comparison_operator: str = '$eq'):
+        '''
+
+        '''
+        if attribute is None or attribute_value is None:
+            data = self._database[collection_name].find()
+        else:
+            data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}})
+
+        df = pd.DataFrame(list(data))
+        df.set_index('radsatznummer', inplace=True)
+        return df
+
+
+if __name__ == "__main__":
+
+    log = Log("Test MongodbHandler:")
+
+    log.info('Script started')
+
+    db_handler = MongodbHandler()
+
+    # Create a collection for each schema file and attach its validation schema.
+    for schema_path in [
+            os.path.join(".", "mongo_schema", "schema_wheelsets.json"),
+            os.path.join(".", "mongo_schema", "schema_process_instances.json"),
+            os.path.join(".", "mongo_schema", "schema_componets.json")]:
+
+        if os.path.isfile(schema_path):
+
+            # strip the "schema_" prefix and the file extension
+            collection_name = os.path.basename(schema_path).split(".")[0]
+            collection_name = collection_name.replace("schema_", "", 1)
+
+            db_handler.create_collection_and_set_schema(collection_name, schema_path)
+
+    log.info(("Existing databases: {}, Collection in OEBB database {}")\
+             .format(db_handler._client.list_database_names(), db_handler._database.list_collection_names()))

+ 595 - 0
db_handlers/SQLHandler.py

@@ -0,0 +1,595 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Sep 18 16:20:50 2018
+
+@author: tanya
+"""
+
+import os
+import sys
+import re
+import sqlalchemy
+import sqlparse
+import pandas as pd
+import warnings
+
+sys.path.append(os.getcwd())
+
+
+class SQLHandler:
+    '''
+    Provides methods for executing sql queries
+    with different database connectors.
+    Remark: in each method we force a new opening and
+    closing of a database connection,
+    this avoids errors when parallelizing with multiprocessing.
+    '''
+
+    def __init__(self, db_uri: str = None,
+                 is_case_insensitive: bool = False):
+        '''
+        :param str db_uri:
+            of form
+            <sqlalchemy_dialect://user:password@host:port/dbname?charset=utf8&local_infile=1>
+
+         sqlalchemy dialects:
+             for mysql: mysql+pymysql
+             for db2: ibm_db_sa
+        '''
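+        # Example URI (hypothetical credentials and host), matching the
+        # pattern documented above:
+        #   "mysql+pymysql://user:password@localhost:3306/mydb?charset=utf8&local_infile=1"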
+
+        from libraries.log import Log
+        from libraries.configuration import default as cfg
+        from sqlalchemy_utils import database_exists, create_database
+
+        self._log = Log(name='SQLHandler')
+
+        if db_uri is None:
+            db_uri = cfg["SQL_DB"]["URI"]
+
+        assert(isinstance(db_uri, str)),\
+            "Parameter 'db_uri' must be of type str"
+
+        assert(re.match(r'.+://.+:(.+)?@.+:.+/.+', db_uri) is not None),\
+            ('database url does not match the pattern: '
+             'sqlalchemy_dialect://user:password@host:port/dbname')
+
+        self._db_uri = db_uri
+
+        engine = sqlalchemy.create_engine(self._db_uri)
+
+        if not database_exists(engine.url):
+            create_database(engine.url)
+
+        query = "CREATE DATABASE IF NOT EXISTS {}"\
+            .format(self._connection_params["db"])
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            engine.execute(query)
+
+        assert(isinstance(is_case_insensitive, bool)),\
+            "Parameter 'is_case_sensetive' must of type bool"
+
+        if 'ibm' in db_uri and not is_case_insensitive:
+            raise Exception('Ibm db2 is case insensitive')
+
+        self._is_case_insensitive = is_case_insensitive
+
+        self._engine = sqlalchemy.create_engine(self._db_uri)
+
+    @property
+    def _connection_params(self) -> dict:
+        '''
+        :return: connection parameters like user,
+            password, host, port, and database name
+        :rtype: dict
+        '''
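+        # For a (hypothetical) URI
+        #   mysql+pymysql://user:pw@localhost:3306/mydb?charset=utf8
+        # the returned dictionary would be
+        #   {'user': 'user', 'password': 'pw', 'host': 'localhost',
+        #    'port': '3306', 'db': 'mydb'}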
+        try:
+            connection_params = {}
+
+            connection_params['user'], connection_params['password'] =\
+                self._db_uri.split('//')[1]\
+                            .split('@')[0]\
+                            .split(':')
+
+            connection_params['host'], connection_params['port'] =\
+                self._db_uri.split('//')[1]\
+                            .split('@')[1]\
+                            .split('/')[0]\
+                            .split(':')
+
+            connection_params['db'] = self._db_uri.split('/')[-1]\
+                                                  .split('?')[0]
+
+            return connection_params
+
+        except Exception as e:
+            err = ("Could not parse connection parameters."
+                   "Finished with error {}")\
+                   .format(e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def drop_database(self):
+        '''
+        '''
+        database = self._connection_params["db"]
+        self.execute("DROP DATABASE IF EXISTS {}".format(database))
+
+    @property
+    def _db_metadata(self) -> dict:
+        '''
+        Returns sql-dialect specific information, like the name of the
+        information schema and the column names in information_schema.tables
+        and information_schema.columns.
+        For ibm databases, information_schema is set to syscat,
+        else it is set to information_schema
+        If these default values do not exist in the given database,
+        the output of the method is set to None
+
+        :return: dictionary with information_schema, schema_col,
+            table_col, column_col, default_schema
+        '''
+
+        db_metadata = {}
+
+        if 'ibm' in self._db_uri:
+            db_metadata['information_schema'] = 'syscat'
+            db_metadata['schema_col'] = 'tabschema'
+            db_metadata['table_col'] = 'tabname'
+            db_metadata['column_col'] = 'colname'
+            db_metadata['default_schema'] =\
+                self._connection_params['user'].upper()
+        else:
+            db_metadata['information_schema'] = 'information_schema'
+            db_metadata['schema_col'] = 'TABLE_SCHEMA'
+            db_metadata['table_col'] = 'TABLE_NAME'
+            db_metadata['column_col'] = 'COLUMN_NAME'
+            db_metadata['default_schema'] =\
+                self._connection_params['db']
+
+        # check if it worked to create metadata
+        try:
+            query = """SELECT *
+                       FROM {}.tables
+                       LIMIT 1
+                    """.format(db_metadata['information_schema'])
+            self.execute(query)
+
+        except Exception as e:
+            self._log.error(e)
+            db_metadata = None
+
+        return db_metadata
+
+    def execute(self, query):
+        '''
+        Executes an sql query (or several queries separated by semicolons).
+        Remark: queries like CREATE, DROP, SELECT work
+        for the majority of sqlalchemy dialects.
+        Queries like SHOW TABLES, LOAD DATA, and queries using
+        INFORMATION_SCHEMA are mysql specific and might
+        not exist in a different dialect.
+
+        :param str query:
+        '''
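+        # Usage sketch (hypothetical table name); several statements can be
+        # passed in one string, they are split with sqlparse and executed
+        # one by one:
+        #   sql_handler.execute("CREATE TABLE t (id INT); INSERT INTO t VALUES (1);")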
+        connection = self._engine.connect()
+        transaction = connection.begin()
+
+        errors = []
+
+        # in the case of a multi-statement query, execute each statement separately
+        for sub_query in sqlparse.split(query):
+            if len(sub_query) > 0:
+                try:
+                    connection.execute(sub_query)
+
+                except Exception as e:
+                    errors.append(str(e))
+
+        if len(errors) > 0:
+            err = ('Could not execute some of the queries. '
+                   'Obtained exceptions: {}'
+                   .format('\n'.join(errors)))
+
+            self._log.error(err)
+            raise Exception(err)
+
+        transaction.commit()
+        connection.close()
+
+    def execute_query_from_file(self, filename: str):
+        '''
+        Reads an sql query from a file and executes it.
+
+        :param str filename: path to the file containing the query
+        '''
+        with open(filename, 'r') as f:
+            query = f.read()
+
+        self.execute(query)
+
+    def get_tablenames(self, schema: str = None, query: str = None):
+        '''
+        Returns the list of table names in the given schema.
+
+        :param str schema: if None, the default schema from _db_metadata is used
+        :param str query: if not specified, the table names are selected
+            from the information_schema specified in _db_metadata
+        '''
+        if (self._db_metadata is None) and (query is None):
+            raise Exception('Please specify the query')
+
+        else:
+            try:
+                if query is None:
+                    schema_or_default_schema =\
+                        self._db_metadata['default_schema']\
+                        if schema is None else schema
+
+                    query = """SELECT DISTINCT {0}
+                               FROM {1}.tables
+                               WHERE {2} = '{3}'
+                            """.format(
+                            self._db_metadata['table_col'],
+                            self._db_metadata['information_schema'],
+                            self._db_metadata['schema_col'],
+                            schema_or_default_schema)
+
+                tables = self.read_sql_to_dataframe(query).iloc[:, 0].tolist()
+                return tables
+
+            except Exception as e:
+                err = ("Could not get tablenames"
+                       "Finished with error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+    def check_if_table_exists(self, tablename: str,
+                              schema: str = None,
+                              query: str = None):
+        '''
+        Tries to retrieve table information from database with given query.
+        If this does not work, tries to select one row from the given table,
+        if this fails, assumes that the table does not exist.
+
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, tries to find
+            tablename in information_schema specified in _db_metadata.
+        :return: if the table exists or not
+        :rtype: bool
+        '''
+        if self._is_case_insensitive:
+            tablename = tablename.upper()
+
+        try:
+            tablenames = self.get_tablenames(schema=schema, query=query)
+
+            table_exists = (tablename in tablenames)
+
+        except Exception as e:
+            self._log.warning(('Could not execute query to retrieve table '
+                               'information. Trying to execute a '
+                               'select statement. '
+                               'Got exception {}').format(e))
+            try:
+                query = """SELECT *
+                           FROM {0}{1}
+                           LIMIT 1
+                        """.format('' if schema is None else schema + '.',
+                                   tablename)
+
+                self.execute(query)
+
+                table_exists = True
+
+            except Exception as e:
+                self._log.warning(('Failed to select from {0}. '
+                                   'Finished with error {1}. '
+                                   'Conclusion: table does not exist')
+                                  .format(tablename, e))
+
+                table_exists = False
+
+        return table_exists
+
+    def create_schema(self, schema: str, query: str = None):
+        '''
+        Creates a schema if it does not exist, else does nothing
+
+        :param str schema:
+        :param str query: if None trying to read schemas from
+            information_schema specified in db_metadata
+        '''
+        if (query is None):
+
+            if self._db_metadata is None:
+                raise Exception('Please specify query')
+            else:
+                query = """SELECT DISTINCT {0}
+                           FROM {1}.tables""".format(
+                              self._db_metadata['schema_col'],
+                              self._db_metadata['information_schema'])
+
+        try:
+            schemas = self.read_sql_to_dataframe(query).iloc[:, 0].tolist()
+        except Exception as e:
+            err = ("Could not retrieve the list of schemas"
+                   "from the database. Finished with error {}"
+                   .format(e))
+
+            self._log.error(err)
+            raise Exception(err)
+
+        if schema not in schemas:
+            self.execute("CREATE SCHEMA {}".format(schema))
+
+    def drop_table_if_exists(self, tablename: str,
+                             schema: str = None,
+                             query: str = None):
+        '''
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, default value is "DROP TABLE"
+        '''
+        if self._is_case_insensitive:
+            tablename = tablename.upper()
+
+        schema_prefix = '' if schema is None else schema + '.'
+
+        if query is None:
+            query = "DROP TABLE {0}{1};".format(schema_prefix, tablename)
+
+        try:
+            if self.check_if_table_exists(tablename, schema=schema):
+                self.execute(query)
+
+        except Exception as e:
+            err = ("Could not drop the table {0} ."
+                   "Finished with error {1}"
+                   .format(tablename, e))
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def get_column_names(self, tablename: str,
+                         schema: str = None,
+                         query: str = None):
+        '''
+        Tries to retrieve column information from database with given query.
+        If this does not work, tries to select one row from the given table.
+
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, tries to select column
+            names in the information_schema specified in db_metadata
+        '''
+        if self._is_case_insensitive:
+            tablename = tablename.upper()
+
+        if not self.check_if_table_exists(tablename=tablename,
+                                          schema=schema):
+
+            err = "Table {} does not exist".format(tablename)
+            self._log.error(err)
+            raise Exception(err)
+
+        try:
+            if query is None:
+                if self._db_metadata is None:
+                    raise Exception('Please specify the query')
+
+                else:
+                    schema_or_default_schema =\
+                        self._db_metadata['default_schema']\
+                        if schema is None else schema
+
+                    query = """SELECT DISTINCT {0}
+                               FROM {1}.columns
+                               WHERE {2} = '{3}'
+                               AND {4} = '{5}'
+                            """.format(
+                            self._db_metadata['column_col'],
+                            self._db_metadata['information_schema'],
+                            self._db_metadata['schema_col'],
+                            schema_or_default_schema,
+                            self._db_metadata['table_col'],
+                            tablename)
+
+            colnames = [c.lower() for c in
+                        self.read_sql_to_dataframe(query).iloc[:, 0].tolist()]
+
+        except Exception as e:
+            self._log.warning((
+                'Could not select columns from the '
+                'information schema. Trying to '
+                'load the table into a dataframe and select the column names. '
+                'Obtained exception {}').format(e))
+
+            query = """SELECT *
+                       FROM {0}{1}
+                       LIMIT 1
+                    """.format('' if schema is None else schema + '.',
+                               tablename)
+
+            data = self.read_sql_to_dataframe(query)
+            colnames = data.columns.tolist()
+
+        return colnames
+
+    def load_csv_to_db(self, filename: str,
+                       tablename: str,
+                       schema: str = None,
+                       query: str = None,
+                       **kwargs):
+        '''
+        Tries to load data from csv file to database with a given query.
+        If this does not work, tries to load data from csv to a
+        pandas dataframe first, and then write it to the database.
+
+        :param str filename:
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, tries to use
+        LOAD DATA LOCAL INFILE query
+        '''
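+        # Usage sketch (hypothetical file path; 'rs1' is the table read in the
+        # test block of MigrationCleaning.py). Requires an existing table and a
+        # MySQL connection that allows LOCAL INFILE:
+        #   sql_handler.load_csv_to_db(filename="./data/rs1.csv", tablename="rs1")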
+
+        if not self.check_if_table_exists(tablename=tablename,
+                                          schema=schema):
+
+            err = ('Table {} does not exist. '
+                   'Please create it first').format(tablename)
+            self._log.error(err)
+            raise Exception(err)
+
+        else:
+            try:
+                if query is None:
+                    query = """LOAD DATA LOCAL INFILE '{0}'
+                               INTO TABLE {1}{2}
+                               COLUMNS TERMINATED BY ','
+                               OPTIONALLY ENCLOSED BY '"'
+                               LINES TERMINATED BY '\r\n'
+                               IGNORE 1 LINES
+                               ({3})
+                               ;""".format(
+                                   filename,
+                                   '' if schema is None else schema + '.',
+                                   tablename,
+                                   ','.join(self.get_column_names(tablename)))
+
+                self.execute(query)
+
+            except Exception as e:
+                err = ("Could not load the file {0} "
+                       "to the table {1} ."
+                       "Finished with error {2}")\
+                       .format(filename, tablename, e)
+
+                self._log.error(err)
+                raise Exception(err)
+
+    def read_sql_to_dataframe(self, query: str, **read_sql_kwargs):
+        '''
+        :param str query: normally a SELECT sql statement
+        :param read_sql_kwargs: additional arguments to pandas read_sql method
+        :return: selected data
+        :rtype: DataFrame
+        '''
+        try:
+            connection = self._engine.connect()
+
+            data = pd.read_sql(sql=query,
+                               con=connection,
+                               **read_sql_kwargs)
+
+            connection.close()
+            return data
+
+        except Exception as e:
+            err = ("Could not read the query to a dataframe. "
+                   "Finished with error {}").format(e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def read_table(self, tablename: str,
+                   schema: str = None,
+                   **read_sql_kwargs):
+        '''
+        :param str tablename:
+        :param str schema:
+        :param read_sql_kwargs: additional arguments to pandas read_sql method
+        :return: selected table
+        :rtype: DataFrame
+        '''
+        schema = '' if schema is None else schema + '.'
+
+        try:
+            return self.read_sql_to_dataframe(
+                    query="SELECT * FROM {0}{1};".format(schema, tablename),
+                    **read_sql_kwargs)
+        except Exception as e:
+            err = ("Could not read the table {0} to a dataframe. "
+                   "Finished with error {1}").format(tablename, e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def append_to_table(self, data: pd.DataFrame,
+                        tablename: str,
+                        schema: str = None,
+                        to_sql_kwargs={'index': False}):
+        '''
+        :param DataFrame data: data to append
+        :param str tablename: table where data is appended
+        :param str schema:
+        :param dict to_sql_kwargs: additional arguments to pandas to_sql method
+        '''
+        if schema is not None:
+            self.create_schema(schema)
+
+        try:
+            connection = self._engine.connect()
+
+            data.to_sql(name=tablename,
+                        schema=schema,
+                        con=connection,
+                        if_exists='append',
+                        **to_sql_kwargs)
+
+            connection.close()
+
+        except Exception as e:
+            err = ("Could append data to the table {0}. "
+                   "Finished with error {1}").format(tablename, e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def overwrite_table(self, data: pd.DataFrame,
+                        tablename: str,
+                        schema: str = None,
+                        to_sql_kwargs={'index': False}):
+        '''
+        :param DataFrame data: data to write to the database
+        :param str tablename: table where data is written
+        :param str schema:
+        :param to_sql_kwargs: additional arguments to pandas to_sql method
+        '''
+
+        if schema is not None:
+            self.create_schema(schema)
+
+        try:
+
+            connection = self._engine.connect()
+
+            data.to_sql(name=tablename,
+                        schema=schema,
+                        con=connection,
+                        if_exists='replace',
+                        **to_sql_kwargs)
+
+            connection.close()
+
+        except Exception as e:
+            err = ("Could overwrite the table {0}. "
+                   "Finished with error {1}").format(tablename, e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def draw_er_diagram_from_db(self, diagram_path: str = None,
+                                schema: str = None,
+                                include_tables: list = None):
+        '''
+        Renders an entity-relationship diagram of the database to an image
+        file using eralchemy.
+
+        :param str diagram_path: output path, defaults to "erd.png"
+        :param str schema: restrict the diagram to this schema
+        :param list include_tables: restrict the diagram to these tables
+        '''
+        if diagram_path is None:
+            diagram_path = "erd.png"
+        else:
+            diagram_dir = os.path.dirname(diagram_path)
+            if diagram_dir != "":
+                os.makedirs(diagram_dir, exist_ok=True)
+
+        import eralchemy
+        eralchemy.render_er(self._db_uri,
+                            diagram_path,
+                            schema=schema,
+                            include_tables=include_tables)

BIN
db_handlers/__pycache__/MongodbHandler.cpython-37.pyc


BIN
db_handlers/__pycache__/SQLHandler.cpython-37.pyc


BIN
db_handlers/__pycache__/SQLOperations.cpython-37.pyc


+ 352 - 0
db_migration/DataFrameToCollection.py

@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 22 11:05:47 2019
+
+@author: tanya
+
+@description: a function to reshape a pandas dataframe to a list of
+(possibly nested) documents with respect to a (json) mongodb schema
+"""
+
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+
+class DataFrameToCollection:
+    '''
+    Reshapes a pandas dataframe into a list of (possibly nested) documents
+    with respect to a (json) mongodb schema.
+    '''
+    def __init__(self, schema_path: str = None, log_path: str = None):
+        '''
+        '''
+        from libraries.log import Log
+        import json
+
+        self._log = Log("ParseJsonSchema")
+
+        if schema_path is not None:
+
+            if not os.path.isfile(schema_path):
+                err = "JsonSchema not found"
+                self._log.error(err)
+                raise FileNotFoundError(err)
+
+            # load schema to dictionary if it is a valid json file
+            try:
+                with open(schema_path, "r") as f:
+                    self.schema = json.load(f)
+
+            except Exception as e:
+                err = ("Could not load json schema, "
+                       "Obtained error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+        else:
+            self.schema = None
+
+    def to_list_of_documents(self, data: pd.DataFrame,
+                             grp_fields: list,
+                             schema: dict = None,
+                             _return_data: bool = False) -> list:
+        '''
+        Reshapes a pandas dataframe to a list of documents according
+         to a complex (json) mongodb schema
+
+         Remark1: column names of data need to reflect the "nestedness"
+         of the field in the mongodb schema with the help of a "." separator
+         Example: field.sub_field_1, field.sub_field_2
+
+         Remark2: if the schema is stored as a json file, first load it
+         to a dictionary with the help of the python json module
+        '''
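+        # For example (columns taken from the test block at the bottom of this
+        # file), the columns ["a", "d.da", "d.db"] grouped by ["a"] produce
+        # documents of the form {"a": 1, "d": {"da": 11, "db": 33}}.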
+        from copy import deepcopy
+        from libraries.log import Log
+
+        log = Log("reshape_dataframe_to_list_of_documents:")
+
+        data = self._melt_duplicated_columns(data)
+
+        reshaped_fields = []
+
+        if schema is None:
+            schema = self.schema
+
+        for field in schema["properties"]:
+
+            if field not in self._unroll_nested_names(data.columns):
+                continue
+
+            field_type = schema["properties"][field]["bsonType"]
+
+            # if field has a simple type
+            if field_type not in ["array", "object"]:
+
+                grp_fields = [c for c in grp_fields if c in data.columns]
+
+                n_distinct_values = data.groupby(grp_fields)[field].nunique()\
+                                        .max()
+
+                if n_distinct_values != 1:
+                    err = "Field {0} is not unique with respect to {1}"\
+                          .format(field, grp_fields)
+
+                    log.error(err)
+                    raise Exception(err)
+
+                if field not in grp_fields:
+                    reshaped_field = data.groupby(grp_fields)[field].first()
+                else:
+                    reshaped_field =\
+                        data[grp_fields].drop_duplicates()\
+                        .set_index(grp_fields, drop=False)[field]
+
+                reshaped_fields.append(reshaped_field)
+
+            # if field is sub-document (dictionary)
+            elif field_type == "object":
+
+                sub_schema = deepcopy(schema["properties"][field])
+
+                # rename sub-schema properties to match with data column names
+                sub_schema["properties"] =\
+                    {".".join([field, k]): v for k, v
+                     in sub_schema["properties"].items()}
+
+                sub_data = self.to_list_of_documents(
+                            data=data,
+                            schema=sub_schema,
+                            grp_fields=grp_fields,
+                            _return_data=True)
+
+                reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                reshaped_field.name = field
+
+                reshaped_fields.append(reshaped_field)
+
+            # if field is a list of dictionaries
+            elif field_type == "array":
+
+                items_type = schema["properties"][field]["items"]["bsonType"]
+
+                if items_type == "object":
+
+                    sub_schema = deepcopy(schema["properties"][field]["items"])
+
+                    # rename sub-schema properties to match data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v in
+                         sub_schema["properties"].items()}
+
+                    # extend grp fields by sub-fields of field simple types
+                    sub_grp_fields =\
+                        [f for f in sub_schema["properties"]
+                         if sub_schema["properties"][f]["bsonType"]
+                         not in ["array", "object"]]
+
+                    if len(sub_grp_fields) == 0:
+                        err = ("One of the sub-keys in a list of documents"
+                               " must be of simple type for the field {}"
+                               .format(field))
+
+                        log.error(err)
+                        raise Exception(err)
+
+                    # group and reshape sub-fields with complex types
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields + sub_grp_fields,
+                                _return_data=True)
+
+                    if sub_data is not None:
+
+                        # gather the results into a list of dictionaries
+                        sub_data = sub_data.apply(self._make_dict, axis=1)
+
+                        sub_data.name = field
+                        sub_data = sub_data.reset_index(grp_fields)
+
+                        reshaped_field =\
+                            sub_data.groupby(grp_fields)[field]\
+                                    .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+                # if field is a list of values with simple type
+                else:
+
+                    grp_fields = [c for c in grp_fields if c in data.columns]
+
+                    if field in data.columns:
+
+                        reshaped_field = data.groupby(grp_fields)[field]\
+                                           .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+        if len(reshaped_fields) > 0:
+            reshaped_data = pd.concat(reshaped_fields, axis=1)
+
+            if not _return_data:
+
+                list_of_documents =\
+                    reshaped_data.drop(list(reshaped_data.index.names),
+                                       axis=1, errors="ignore")\
+                                 .reset_index(drop=False)
+
+                log.info("Done reshaping the dataframe to a list of documents")
+
+                return list_of_documents
+
+            else:
+
+                return reshaped_data
+
+    def _melt_duplicated_columns(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Melts columns that appear more than once in the dataframe
+        into a single column.
+        '''
+        for c in set(data.columns):
+            if isinstance(data[c], pd.DataFrame):
+                data = pd.melt(data, id_vars=[cc for cc in data.columns
+                                              if cc != c], value_vars=c)\
+                         .drop("variable", axis=1)\
+                         .rename(columns={"value": c})
+
+        return data
+
+    def _make_dict(self, x: pd.Series) -> dict:
+        '''
+        :return: the input pandas series transformed into a dictionary;
+         meant to be applied to a dataframe with axis = 1,
+         so that the index of the input series holds the column names
+         of the dataframe
+        '''
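+        # e.g. a series with index ["d.da", "d.db"] and values [11, 33]
+        # becomes {"da": 11, "db": 33}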
+        return {f.split(".")[-1]: x[f] for f in x.index}
+
+    def _make_list(self, x: pd.Series) -> list:
+        '''
+        return: list of values in a series
+        '''
+        return list(x)
+
+    def _make_list_of_distinct(self, x: pd.Series) -> list:
+        '''
+        return: list of unique values from a Series where
+         entries are arbitrary objects
+         (pandas unique() method does not work if entries are of complex types)
+        '''
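+        # e.g. pd.Series([{"a": 1}, {"a": 1}, {"b": 2}]) -> [{"a": 1}, {"b": 2}]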
+        distinct = []
+        for obj in x:
+            if obj not in distinct:
+                distinct.append(obj)
+        return distinct
+
+    def _unroll_nested_names(self, columns: list) -> list:
+        '''
+        Returns all nested prefixes of the given column names,
+        e.g. "a.b.c" yields "a", "a.b" and "a.b.c".
+        '''
+        unrolled = []
+
+        for c in columns:
+            splitted = c.split(".")
+            for i in range(len(splitted)):
+                unrolled.append(".".join(splitted[:i+1]))
+
+        return unrolled
+
+
+if __name__ == "__main__":
+
+    # Testing
+
+    df = pd.DataFrame({
+                       "a": [1]*8 + [2]*8,
+                       "b": [10]*8 + [20]*8,
+                       "c": [100, 200]*8,
+                       "d.da": [11]*8 + [22]*8,
+                       "d.db": [33]*8 + [34]*8,
+                       "e.ea.eaa": [5]*8 + [55]*8,
+                       "e.ea.eab": [6]*8 + [66]*8,
+                       "e.eb": [2, 2, 3, 3]*4,
+                       "e.ec.eca": [1, 2, 3, 4]*4,
+                       "e.ec.ecb": [5, 6, 7, 8]*4,
+                       "f.fa": [1]*4 + [3]*4 + [11]*4 + [33]*4,
+                       "f.fb": [2]*4 + [3]*2 + [4]*2 + [22]*4 + [44]*4})
+
+    duplicate = pd.DataFrame({"c": [300, 400]*8})
+
+    df = pd.concat([df, duplicate], axis=1)
+
+    schm = {
+              "bsonType": "object",
+              "required": ["a"],
+              "properties": {
+
+                  "a": {"bsonType": "integer"},
+
+                  "b": {"bsonType": "integer"},
+
+                  "c": {
+                      "bsonType": "array",
+                      "items": {"bsonType": "integer"}
+                  },
+                  "d": {
+                      "bsonType": "object",
+                      "properties": {
+                          "da": {"bsonType": "integer"},
+                          "db": {"bsonType": "integer"}
+                       }
+                  },
+                  "e": {
+                      "bsonType": "object",
+                      "properties": {
+                          "ea": {
+                              "bsonType": "object",
+                              "properties": {
+                                  "eaa": {"bsonType": "integer"},
+                                  "eab": {"bsonType": "integer"}
+                               }
+
+                          },
+
+                          "eb": {
+                              "bsonType": "array",
+                              "items": {"bsonType": "integer"}
+                          },
+
+                          "ec": {
+                                "bsonType": "array",
+                                "items": {
+                                  "bsonType": "object",
+                                  "properties": {
+                                      "eca": {"bsonType": "integer"},
+                                      "ecb": {"bsonType": "integer"}
+                                    }
+                                  }
+                          }
+                      }
+                  },
+                  "f": {
+                      "bsonType": "array",
+                      "items": {
+                          "bsonType": "object",
+                          "properties": {
+                              "fa": {"bsonType": "integer"},
+                              "fb": {
+                                  "bsonType": "array",
+                                  "items": {"bsonType": "integer"}
+                              }
+                          }
+                      }
+                  }
+              }
+              }
+
+    grp_fields = ["a"]
+
+    result = DataFrameToCollection().to_list_of_documents(
+                    data=df,
+                    schema=schm,
+                    grp_fields=grp_fields)

+ 520 - 0
db_migration/MigrationCleaning.py

@@ -0,0 +1,520 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 25 08:09:52 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import pandas as pd
+import numpy as np
+import gc
+
+sys.path.append(os.getcwd())
+
+from libraries.db_migration.ParseMapping import ParseMapping
+from libraries.db_migration.ParseJsonSchema import ParseJsonSchema
+from libraries.utils.ClassLogging import ClassLogging
+from libraries.utils.CleaningUtils import CleaningUtils
+
+
+class MigrationCleaning(ClassLogging):
+    '''
+    Class for correcting and filtering the incorrect data.
+    We keep the correcting and the filtering methods separated,
+    since there might be other custom steps in between.
+    '''
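+    # A typical call order (see the test block at the bottom of this file):
+    # replace_default_values -> map_equal_values -> convert_types ->
+    # filter_invalid_types -> filter_invalid_null_values ->
+    # filter_invalid_patterns -> filter_notallowed_values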
+    def __init__(self, mapping_path: str,
+                 schema_paths: (str, list),
+                 inconsist_report_table: str = None,
+                 filter_index_columns: (str, list) = None,
+                 mapping_source: str = "internal_name",
+                 mapping_target: str = "mongo_name",
+                 mapping_parser: type = ParseMapping,
+                 schema_parser: type = ParseJsonSchema,
+                 log_name: str = "MigrationCleaning"):
+        '''
+        '''
+        super().__init__(log_name=log_name)
+
+        assert (inconsist_report_table is None) or\
+            isinstance(inconsist_report_table, str),\
+            "Inconsistent report table should be a tablename string"
+
+        self._inconsist_report_table = inconsist_report_table
+
+        assert (filter_index_columns is None) or\
+            isinstance(filter_index_columns, (str, list)),\
+            "Filter index columns must be a str or a list"
+
+        if isinstance(filter_index_columns, str):
+            filter_index_columns = [filter_index_columns]
+
+        self._filter_index_columns = filter_index_columns
+
+        self._schema_parser = schema_parser(schema_paths)
+
+        self._mapping_parser = mapping_parser(mapping_path,
+                                              source=mapping_source,
+                                              target=mapping_target)
+
+        self._mapping_path = mapping_path
+        self._schema_paths = schema_paths
+
+    def _assert_dataframe_input(self, data: pd.DataFrame):
+        '''
+        '''
+        assert(isinstance(data, pd.DataFrame)),\
+            "Parameter 'data' must be a pandas dataframe"
+
+    @property
+    def _field_mapping(self):
+        '''
+        '''
+        return self._mapping_parser.get_field_mapping()
+
+    @property
+    def _required_fields(self):
+        '''
+        '''
+        source_required_fields = self._mapping_parser.get_required_fields()
+        target_required_fields = self._schema_parser.get_required_fields()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if (target_field in target_required_fields) and\
+                    (source_field not in source_required_fields):
+
+                source_required_fields.append(source_field)
+
+        return source_required_fields
+
+    @property
+    def _default_values(self):
+        '''
+        '''
+        default_values = {}
+
+        target_default_values = self._schema_parser.get_default_values()
+        source_default_values = self._mapping_parser.get_default_values()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if source_field not in source_default_values:
+                continue
+
+            elif target_field not in target_default_values:
+
+                target_default_values[target_field] = np.nan
+
+            default_values[source_field] = {
+                    target_default_values[target_field]:
+                    source_default_values[source_field]
+                    }
+
+        return default_values
+
+    @property
+    def _python_types(self):
+        '''
+        '''
+        target_types = self._schema_parser.get_python_types()
+
+        result = {}
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if target_field in target_types:
+                result[source_field] = target_types[target_field]
+
+            """
+            date_type_mismatch =\
+                    (target_field in target_types) and\
+                    (source_field in source_types) and\
+                    (target_types[target_field] == str) and\
+                    (source_types[source_field] == np.dtype('<M8[ns]'))
+
+            if date_type_mismatch:
+                target_types[target_field] = np.dtype('<M8[ns]')
+
+            if (source_field in source_types) and\
+                    (target_field in target_types) and\
+                    (target_types[target_field] != source_types[source_field]):
+
+                self.log_and_raise(("Type {0} of field {1} "
+                                    "in schema does not match "
+                                    "type {2} of field {3} in "
+                                    "migration mapping")
+                                   .format(target_types[target_field],
+                                           target_field,
+                                           source_types[source_field],
+                                           source_field))
+
+            if target_field in target_types:
+                source_types[source_field] = target_types[target_field]
+
+            """
+
+        return result
+
+    @property
+    def _value_mappings(self):
+        '''
+        '''
+        return self._mapping_parser.get_value_mappings()
+
+    @property
+    def _date_formats(self):
+        '''
+        '''
+        return self._mapping_parser.get_date_formats()
+
+    def _get_mongo_schema_info(self, method_name: str):
+        '''
+        '''
+        result = {}
+
+        target_dict = getattr(self._schema_parser, method_name)()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if target_field in target_dict:
+
+                result[source_field] = target_dict[target_field]
+
+        return result
+
+    @property
+    def _allowed_values(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_allowed_values")
+
+    @property
+    def _minimum_values(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_minimum_value")
+
+    @property
+    def _maximum_values(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_maximum_value")
+
+    @property
+    def _patterns(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_patterns")
+
+    def _filter_invalid_data(self, data: pd.DataFrame,
+                             invalid_mask: pd.Series,
+                             reason: (str, pd.Series)) -> pd.DataFrame:
+        '''
+        Appends the invalid rows, together with the filtering reason, to the
+        inconsistency report table and drops all rows belonging to the
+        affected instances (identified by the filter index columns).
+        '''
+        from libraries.db_handlers.SQLHandler import SQLHandler
+
+        assert((self._inconsist_report_table is not None) and
+               (self._filter_index_columns is not None)),\
+            "Inconsistent report table or filter index is not provided"
+
+        self._assert_dataframe_input(data)
+
+        data = data.copy(deep=True)
+
+        db = SQLHandler()
+
+        if invalid_mask.sum() == 0:
+
+            return data
+
+        data_inconsist = data.assign(reason=reason)\
+                             .loc[invalid_mask]\
+                             .reset_index(drop=True)
+
+        db.append_to_table(data=data_inconsist,
+                           tablename=self._inconsist_report_table)
+
+        n_rows_filtered = len(data_inconsist)
+        n_instances_filtered = len(data_inconsist[self._filter_index_columns].drop_duplicates())
+
+        del data_inconsist
+        gc.collect()
+
+        self._log.warning(("Filtering: {0} ."
+                           "Filtered {1} rows "
+                           "and {2} instances"
+                           .format(reason, n_rows_filtered, n_instances_filtered)))
+
+        nok_index_data = data.loc[invalid_mask, self._filter_index_columns]\
+                             .drop_duplicates().reset_index(drop=True)
+
+        nok_index = pd.MultiIndex.from_arrays([nok_index_data[c] for c in
+                                               self._filter_index_columns])
+
+        all_index = pd.MultiIndex.from_arrays([data[c] for c in
+                                               self._filter_index_columns])
+
+        data = data.loc[~all_index.isin(nok_index)].reset_index(drop=True)
+
+        return data
+
+    def _replace_values(self, data: pd.DataFrame,
+                        default: bool) -> pd.DataFrame:
+        '''
+        '''
+        if default:
+            default_str = "default"
+        else:
+            default_str = "equal"
+
+        self._assert_dataframe_input(data)
+
+        data = data.copy(deep=True)
+
+        if default:
+            mapping = self._default_values
+        else:
+            mapping = self._value_mappings
+
+        for column, d in mapping.items():
+
+            try:
+
+                if column not in data.columns:
+                    continue
+
+                dtype = data[column].dtype
+
+                for key, values in d.items():
+
+                    if not default:
+
+                        mask = (data[column].astype(str).isin(values))
+
+                    else:
+                        mask = (data[column].isin(values))
+
+                    if default:
+
+                        mask = mask | (data[column].isnull())
+
+                    data.loc[mask, column] = key
+
+                data[column] = data[column].astype(dtype)
+
+            except Exception as e:
+
+                self.log_and_raise(("Failed to replace {0} values "
+                                    "in {1}. Exit with error {2}"
+                                    .format(default_str, column, e)))
+
+        self._log.info("Replaced {} values".format(default_str))
+
+        return data
+
+    def replace_default_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        return self._replace_values(data=data, default=True)
+
+    def map_equal_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        return self._replace_values(data=data, default=False)
+
+    def convert_types(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column, python_type in self._python_types.items():
+
+            try:
+                if column not in data.columns:
+                    continue
+
+                elif column in self._date_formats:
+
+                    data[column] = CleaningUtils.convert_dates(
+                            series=data[column],
+                            formats=self._date_formats[column])
+
+                elif (python_type == int) and data[column].isnull().any():
+
+                    self.log_and_raise(("Column {} contains missing values "
+                                        "and cannot be of integer type"
+                                        .format(column)))
+
+                elif python_type == str:
+
+                    python_type = object
+
+                else:
+
+                    data[column] = data[column].astype(python_type)
+
+                if data[column].dtype != python_type:
+
+                    self._log.warning(("After conversion type in {0} "
+                                       "should be {1} "
+                                       "but is still {2}"
+                                       .format(column,
+                                               python_type,
+                                               data[column].dtype)))
+
+            except Exception as e:
+
+                self.log_and_raise(("Failed to convert types in {0}. "
+                                    "Exit with error {1}"
+                                    .format(column, e)))
+
+        self._log.info("Converted dtypes")
+
+        return data
+
+    def filter_invalid_null_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column in data.columns:
+
+            if (column in self._required_fields) and\
+                    (data[column].isnull().any()):
+
+                invalid_mask = data[column].isnull()
+
+                reason = "Null value in the required field {}"\
+                         .format(column)
+
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
+
+        return data
+
+    def filter_invalid_types(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column, python_type in self._python_types.items():
+
+            if column not in data.columns:
+                continue
+
+            if data[column].dtype != python_type:
+
+                def mismatch_type(x):
+                    return type(x) != python_type
+
+                invalid_mask = data[column].apply(mismatch_type)
+
+                reason = "Type mismatch if field {}".format(column)
+
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
+
+        return data
+
+    def filter_invalid_patterns(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column, pattern in self._patterns.items():
+
+            if column not in data.columns:
+                continue
+
+            invalid_mask = (~data[column].astype(str).str.match(pattern))
+
+            reason = "Pattern mismatch in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        return data
+
+    def filter_notallowed_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        for column, value in self._minimum_values.items():
+
+            invalid_mask = data[column] < value
+
+            reason = "Too small values in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        for column, value in self._maximum_values.items():
+
+            invalid_mask = data[column] > value
+
+            reason = "Too large values in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        for column, allowed_values in self._allowed_values.items():
+
+            invalid_mask = (~data[column].isin(allowed_values))
+
+            reason = "Too small values in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        return data
+
+
+if __name__ == "__main__":
+
+    # testing
+
+    from libraries.db_handlers.SQLHandler import SQLHandler
+
+    mapping_path = os.path.join(".", "migration_mappings", "rs1_mapping.json")
+
+    schema_paths = [
+            os.path.join(".", "mongo_schema", "schema_wheelsets.json"),
+            os.path.join(".", "mongo_schema", "schema_process_instances.json")]
+
+    inconsist_report_table = "test_inconsist_report_rs1"
+
+    if all([os.path.isfile(p) for p in schema_paths + [mapping_path]]):
+
+        print("Found schemas!")
+
+        cleaner = MigrationCleaning(
+                mapping_path=mapping_path,
+                schema_paths=schema_paths,
+                mapping_source="internal_name",
+                mapping_target="mongo_name",
+                filter_index_columns=["radsatznummer"],
+                inconsist_report_table=inconsist_report_table)
+
+        db = SQLHandler()
+
+        data = db.read_sql_to_dataframe("select * from rs1 limit 100")
+
+        data = cleaner.replace_default_values(data)
+
+        data = cleaner.map_equal_values(data)
+
+        data = cleaner.convert_types(data)
+
+        non_filtered_len = len(data)
+
+        data = cleaner.filter_invalid_types(data)
+
+        if len(data) < non_filtered_len:
+
+            data = cleaner.convert_types(data)
+
+        data = cleaner.filter_invalid_null_values(data)
+
+        data = cleaner.filter_invalid_patterns(data)
+
+        data = cleaner.filter_notallowed_values(data)
+
+    print("Done!")

+ 62 - 0
db_migration/ParseDbSchema.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 25 08:22:20 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import abc
+sys.path.append(os.getcwd())
+
+
+class ParseDbSchema(metaclass=abc.ABCMeta):
+    '''
+    '''
+    def __init__(self, schema_paths: [list, str], log_file: str = None):
+        '''
+        '''
+        from libraries.log import Log
+
+        self._log = Log(name="ParseDbSchema:", log_file=log_file)
+
+        if isinstance(schema_paths, str):
+            schema_paths = [schema_paths]
+
+        for schema_path in schema_paths:
+            if not os.path.isfile(schema_path):
+                err = "Schema not found"
+                self._log.error(err)
+                raise FileNotFoundError(err)
+
+    @abc.abstractmethod
+    def get_fields(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_datetime_fields(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_python_types(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_default_values(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_allowed_values(self) -> list:
+        '''
+        '''
+        return

+ 332 - 0
db_migration/ParseJsonSchema.py

@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 31 11:41:48 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+from copy import deepcopy
+import numpy as np
+
+sys.path.append(os.getcwd())
+
+from libraries.db_migration.ParseDbSchema import ParseDbSchema
+
+
+class ParseJsonSchema(ParseDbSchema):
+    '''
+    Class for retrieving column properties from mongodb jsonSchema
+    '''
+
+    def __init__(self, schema_paths: [list, str], log_file: str = None):
+        '''
+        '''
+        import json
+        from libraries.log import Log
+
+        super().__init__(schema_paths=schema_paths, log_file=log_file)
+
+        self._log = Log(name="ParseJsonSchema", log_file=log_file)
+
+        # load schemas to dictionaries if they are valid json files
+
+        assert(isinstance(schema_paths, (list, str))),\
+            "Schema paths must be either str or lists"
+
+        if isinstance(schema_paths, str):
+            schema_paths = [schema_paths]
+
+        self.schemas = []
+
+        for schema_path in schema_paths:
+            try:
+                with open(schema_path, "r") as f:
+                    self.schemas.append(json.load(f))
+
+            except Exception as e:
+                err = ("Could not load json schema, "
+                       "Obtained error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+    def get_fields(self) -> list:
+        '''
+        '''
+        return self._parse()
+
+    def get_required_fields(self) -> list:
+        '''
+        '''
+        return self._parse(required_only=True)
+
+    def get_mongo_types(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="bsonType")
+
+    def get_datetime_fields(self):
+        '''
+        '''
+        mongo_types = self.get_mongo_types()
+
+        return [k for k, v in mongo_types.items()
+                if v in ["date", "timestamp", "Date", "Timestamp"]]
+
+    def get_python_types(self) -> dict:
+        '''
+        '''
+        mongo_types = self.get_mongo_types()
+        python_types = {}
+
+        bson_to_python_types_except_dates = {"double": float,
+                                             "decimal": float,
+                                             "string": str,
+                                             "object": object,
+                                             "array": list,
+                                             "bool": bool,
+                                             "int": int,
+                                             "long": int,
+                                             "date": np.dtype('<M8[ns]'),
+                                             "timestamp": np.dtype('<M8[ns]')
+                                             }
+
+        for k, v in mongo_types.items():
+
+            if isinstance(v, list):
+                if ("date" in v) or ("timestamp" in v):
+                    v = "date"
+                elif "string" in v:
+                    v = "string"
+                elif ("double" in v) or ("decimal" in v):
+                    v = "double"
+                elif ("null" in v) and (len(v) == 2) and ("int" not in v):
+                    v = [t for t in v if type != "null"][0]
+                else:
+                    err = "Type {0}: {1} not convertibale".format(k, v)
+                    self._log.error(err)
+                    raise Exception(err)
+
+            if v in bson_to_python_types_except_dates:
+                python_types[k] = bson_to_python_types_except_dates[v]
+
+        return python_types
+
+    def get_patterns(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="pattern")
+
+    def get_default_values(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="default")
+
+    def get_allowed_values(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="enum")
+
+    def get_maximum_value(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="maximum")
+
+    def get_minimum_value(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="minimum")
+
+    def get_max_items(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="maxItems")
+
+    def get_min_items(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="minItems")
+
+    def get_field_descriptions(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="description")
+
+    def _parse(self,
+               field_info: str = None,
+               required_only: bool = False):
+        '''
+        '''
+        result = self._parse_one(schema=self.schemas[0],
+                                 field_info=field_info,
+                                 required_only=required_only)
+
+        for schema in self.schemas[1:]:
+
+            next_result = self._parse_one(schema=schema,
+                                          field_info=field_info,
+                                          required_only=required_only)
+
+            if isinstance(result, list):
+                result.extend(next_result)
+            else:
+                result.update(next_result)
+
+        return result
+
+    def _parse_one(self,
+                   schema: dict,
+                   field_info: str = None,
+                   required_only: bool = False,
+                   super_field_name: str = None,
+                   already_parsed: (list, dict) = None) -> (list, dict):
+        '''
+        Recursive function that returns a list of (nested) field names or
+        a dictionary of (nested) field names with field characteristics.
+
+        :param schema: if None => entire self.schema, or a sub-schema
+            of self.schema
+
+        :param field_info: optional, if provided a dictionary of field
+            names with field characteristics is returned (for example, the
+            bsonType of each field), else a list of fields is returned
+
+        :param required_only: when True, only returns fields marked as
+            required in the mongo schema
+
+        :param super_field_name: needed for recursion
+            Example: the field 'article' has
+            subfields 'id' and 'supplier'.
+            If we parse the sub-document corresponding to article, then
+            super_field_name is 'article' and we might get an output like
+            {'article.id': string, 'article.supplier': string}
+
+        :param already_parsed: needed for recursion
+
+        '''
+        schema = deepcopy(schema)
+
+        assert(isinstance(schema, dict)),\
+            "Parameter 'schema' must be a dict"
+
+        if field_info is None:
+            # parse a list of fields
+            if already_parsed is None:
+                already_parsed = []
+            else:
+                assert(isinstance(already_parsed, list)),\
+                    "Parameter 'already_parsed' must be of type list"
+        else:
+            # parse a dictionary of field names with field characteristics
+            if already_parsed is None:
+                already_parsed = {}
+            else:
+                assert(isinstance(already_parsed, dict)),\
+                    "Parameter 'already_parsed' must be of type dict"
+
+        # If schema is nested, then
+        # either it is of bsonType object
+        # and the field information is stored under the key 'properties'
+        # or it is of bsonType array
+        # and the field information is stored in sub-schemas
+        # under the key 'items'
+
+        # if schema is of bsonType object
+        if "properties" in schema.keys():
+            if "required" in schema.keys():
+                required_subfields = schema["required"]
+
+            for sub_field_name in schema["properties"].keys():
+
+                sub_schema = schema["properties"][sub_field_name]
+
+                # only process fields that are required
+                if required_only and\
+                        (sub_field_name not in required_subfields):
+                    pass
+                else:
+                    if super_field_name is not None:
+                        field_name = '.'.join([super_field_name,
+                                               sub_field_name])
+                    else:
+                        field_name = sub_field_name
+
+                    # if the given sub-field is nested, parse the
+                    # sub-schema corresponding to this sub-field
+                    self._parse_one(
+                            schema=sub_schema,
+                            super_field_name=field_name,
+                            field_info=field_info,
+                            already_parsed=already_parsed,
+                            required_only=required_only)
+
+        # if schema is of bsonType array
+        elif "items" in schema.keys():
+            # one schema for all items
+            if isinstance(schema["items"], dict):
+
+                sub_schema = schema["items"]
+
+                self._parse_one(schema=sub_schema,
+                                super_field_name=super_field_name,
+                                field_info=field_info,
+                                already_parsed=already_parsed,
+                                required_only=required_only)
+
+            # list of separate schemas for each item
+            elif isinstance(schema["items"], list):
+
+                for sub_schema in schema["items"]:
+                    self._parse_one(schema=sub_schema,
+                                    super_field_name=super_field_name,
+                                    field_info=field_info,
+                                    already_parsed=already_parsed,
+                                    required_only=required_only)
+            else:
+                raise Exception(('Schema is not composed correctly: '
+                                 'items must be a dictionary or a list'))
+        else:
+            # If neither properties nor items is in schema keys
+            # we reached the last level of nestedness,
+            # field information is stored in the schema keys.
+            field_name = super_field_name
+
+            if field_info is None:
+                already_parsed.append(field_name)
+            else:
+                if field_info in schema.keys():
+                    already_parsed[field_name] = schema[field_info]
+                else:
+                    pass
+
+        return already_parsed
+
+
+if __name__ == "__main__":
+
+    # Only for testing
+
+    schema_path = os.path.join(".", "mongo_schema", "schema_wheelsets.json")
+
+    if os.path.isfile(schema_path):
+
+        parse_obj = ParseJsonSchema(schema_paths=schema_path)
+
+        fields = parse_obj.get_fields()
+
+        required_fields = parse_obj.get_required_fields()
+
+        patterns = parse_obj.get_patterns()
+
+        mongo_types = parse_obj.get_mongo_types()
+
+        python_types_except_dates = parse_obj.get_python_types()
+
+        datetime_fields = parse_obj.get_datetime_fields()
+
+        allowed_values = parse_obj.get_allowed_values()
+
+        descriptions = parse_obj.get_field_descriptions()
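+
+        # Hedged illustration (not part of the original tests): _parse_one
+        # flattens nested schemas into dot-separated field names. For the
+        # minimal inline schema below the expected output is
+        # ['article.id', 'article.supplier'].
+        toy_schema = {
+            "bsonType": "object",
+            "properties": {
+                "article": {
+                    "bsonType": "object",
+                    "properties": {
+                        "id": {"bsonType": "string"},
+                        "supplier": {"bsonType": "string"}}}}}
+
+        print(parse_obj._parse_one(schema=toy_schema))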

+ 157 - 0
db_migration/ParseMapping.py

@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 20 15:33:17 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import numpy as np
+sys.path.append(os.getcwd())
+
+
+class ParseMapping:
+    '''
+    '''
+    def __init__(self, mapping_path: str, log_name: str = "ParseMapping",
+                 source: str = "original_name", target: str = "original_name"):
+        '''
+        '''
+        import json
+        from libraries.log import Log
+
+        self._log = Log(log_name)
+
+        if not os.path.isfile(mapping_path):
+            err = "Mapping not found"
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+        try:
+            with open(mapping_path, "r") as f:
+                self._mapping = json.load(f)
+
+        except Exception as e:
+            err = ("Could not load mapping. "
+                   "Exit with error {}".format(e))
+            self._log.error(err)
+            raise Exception(err)
+
+        self._source = source
+        self._target = target
+
+    def get_field_mapping(self) -> dict:
+        '''
+        '''
+        assert(all([set([self._source, self._target]) <= set(d)
+                    for d in self._mapping]))
+
+        return {d[self._source]: d[self._target] for d in self._mapping}
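+
+    # Hedged illustration of the assumed mapping layout (field names are
+    # made up; real mapping files may differ):
+    #   self._mapping = [{"original_name": "Radsatznummer",
+    #                     "internal_name": "radsatznummer",
+    #                     "mongo_name": "process.wheelset",
+    #                     "type": "Text", "required": 1}, ...]
+    # With source="internal_name" and target="mongo_name",
+    # get_field_mapping() would then return
+    #   {"radsatznummer": "process.wheelset"}.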
+
+    def _get_fields_satisfying_condition(self, key: str, value) -> list:
+        '''
+        '''
+        assert(all([self._source in d for d in self._mapping])),\
+            "Invalid from field"
+
+        return [d[self._source] for d in self._mapping
+                if (key in d) and (d[key] == value)]
+
+    def get_required_fields(self) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key="required",
+                                                        value=1)
+
+    def get_date_fields(self) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key="type",
+                                                        value="Date")
+
+    def _get_info(self, key: str, value=None) -> dict:
+        '''
+        '''
+        assert(all([self._source in d for d in self._mapping])),\
+            "Invalid from field"
+
+        return {d[self._source]: d[key] for d in self._mapping
+                if (key in d) and ((value is None) or (d[key] == value))}
+
+    def get_default_values(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="default_values")
+
+    def get_date_formats(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="date_format")
+
+    def get_types(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="type")
+
+    def get_python_types(self) -> dict:
+        '''
+        '''
+        sql_to_python_dtypes = {
+                "Text": str,
+                "Date": np.dtype('<M8[ns]'),
+                "Double": float,
+                "Integer": int
+                }
+
+        sql_types = self.get_types()
+
+        return {k: sql_to_python_dtypes[v] for k, v in sql_types.items()}
+
+    def get_value_mappings(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="value_mapping")
+
+    def get_column_numbers(self) -> list:
+        '''
+        '''
+        if all(["column_number" in d for d in self._mapping]):
+            column_numbers = [d["column_number"] for d in self._mapping]
+
+        elif all(["column_number" not in d for d in self._mapping]):
+            column_numbers = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Column numbers should ",
+                   "either in all or in neither of the fields")
+            self.log.err(err)
+            raise Exception(err)
+
+        return column_numbers
+
+
+if __name__ == "__main__":
+
+    mapping_path = os.path.join(".", "migration_mappings", "rs0_mapping.json")
+
+    if os.path.isfile(mapping_path):
+
+        print("found mapping path")
+
+        parser = ParseMapping(mapping_path, source="internal_name",
+                              target="mongo_name")
+
+        internal_to_mongo_mapping = parser.get_field_mapping()
+
+        original_to_internal_mapping = parser.get_field_mapping()
+
+        default_values = parser.get_default_values()
+
+        types = parser.get_types()
+
+        column_numbers = parser.get_column_numbers()
+
+        print("Done testing!")

BIN
db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc


BIN
db_migration/__pycache__/MigrationCleaning.cpython-37.pyc


BIN
db_migration/__pycache__/ParseDbSchema.cpython-37.pyc


BIN
db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc


BIN
db_migration/__pycache__/ParseMapping.cpython-37.pyc


+ 798 - 0
hyperopt/HyperoptPipelineSelection.py

@@ -0,0 +1,798 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov  9 13:27:44 2018
+
+@author: tanja
+@description: Implementation of machine learning
+                pipeline selection and tuning with hyperopt library
+"""
+
+import os
+import sys
+import gc
+import logging
+import pickle
+import time
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
+    space_eval, pyll
+
+from sklearn.model_selection import cross_validate
+from sklearn.metrics import make_scorer
+
+
+class HyperoptPipelineSelection:
+    '''
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    developped by hyperopt creators).
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    '''
+    def __init__(self,
+                 cost_func,
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = 1,
+                 log_path: str = None,
+                 averaging_func: callable = None):
+        '''
+        :param callable cost_func: function to minimize or maximize
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from
+            the beginning.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+
+        :param str log_path: Optional, when not provided logs to stdout.
+
+        :param callable averaging_func: optional,
+            when not provided set to mean. Function
+            to aggregate the cross-validated values of the cost function.
+            The classic choice is the mean;
+            another example is mean() - c*var().
+        '''
+
+        assert(callable(cost_func)),\
+            "Parameter 'cost_func' must be a callable"
+
+        assert(isinstance(greater_is_better, bool)),\
+            "Parameter 'greater_is_better' must be bool type"
+
+        assert(isinstance(trials_path, str)),\
+            "Parameter 'trials_path' must be of string type"
+
+        if averaging_func is not None:
+            assert(callable(averaging_func)),\
+                "Parameter 'averaging_func' must be a callable"
+
+        self._assert_valid_directory(path=trials_path)
+
+        self._configure_logger(log_path)
+
+        self._cost_func = cost_func
+        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        self._score_factor = (not greater_is_better) - greater_is_better
+        self._trials_path = trials_path
+        # is initialized with empty trials object
+        self._trials = Trials()
+        self._backup_trials_freq = backup_trials_freq
+        self._averaging_func = averaging_func or np.mean
+        # keeping track of the current search iteration
+        self._run_number = 0
+        # space and data need to be attached to perform search.
+        self._space_attached = False
+        self._data_attached = False
+
+        # if a trials object already exists at the given path,
+        # it is loaded and the search is continued. Else,
+        # the search is started from the beginning.
+        if os.path.isfile(trials_path):
+            try:
+                with open(trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(len(self._trials.trials)))
+
+            except Exception as e:
+                self._logger.error(("Trials object could not be loaded. "
+                                    "Training starts from the beginning. "
+                                    "Exit with error {}").format(e))
+
+        else:
+            self._logger.info(("No existing trials object was found"
+                               "Initialized an empty trials object."))
+
+        self._best_score = self.best_trial_score
+
+    def _configure_logger(self, log_path: str = None):
+        '''
+        Can be replaced with the existing script later.
+        When log_path is not provided, logs to stdout.
+        '''
+
+        self._logger = logging.getLogger(__name__)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_path is not None:
+            assert(isinstance(log_path, str)),\
+                "Parameter 'log_path' must be of string type"
+            self._assert_valid_directory(log_path)
+
+            handler = logging.FileHandler(log_path)
+        else:
+            handler = logging.StreamHandler(sys.stdout)
+
+        formatter = logging.Formatter(
+                '\n %(asctime)s %(levelname)s %(message)s')
+
+        handler.setFormatter(formatter)
+        self._logger.addHandler(handler)
+        self._logger.setLevel("INFO")
+
+    def _backup_trials(self):
+        '''
+        Pickles (Saves) the trials object.
+        Used in a scheduler.
+        '''
+        with open(self._trials_path, "wb") as f:
+            pickle.dump(self._trials, f)
+
+    def _assert_valid_directory(self, path: str):
+        '''
+        If the directory of a path does not exist yet,
+        creates it.
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must be of str type"
+
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            os.makedirs(dirname, exist_ok=True)
+
+    def attach_space(self, space: pyll.base.Apply = None,
+                     module_path: str = None,
+                     name: str = None):
+        '''
+        :param pyll.base.Apply space: hyperopt space where
+            the search is performed. Optional when a space
+            is loaded from a python module.
+
+        :param str module_path: path to python module
+            where the space is defined. Optional when
+            the space is provided directly.
+
+        :param str name: name of the space loaded from
+            a python module. Optional when the space
+            is provided directly.
+        '''
+        assert((space is not None) or
+               ((module_path is not None) and (name is not None))),\
+            "Either space or (module_path, name) must be provided"
+
+        if space is None:
+            for p_name, p_value in [("module_path", module_path),
+                                    ("name", name)]:
+                assert(isinstance(p_value, str)),\
+                    "Parameter '{}' must be of str type".format(p_name)
+
+            assert(os.path.isfile(module_path)),\
+                "Parameter 'module_path' must be a valid file"
+
+            module, extension = os.path.splitext(os.path.basename(module_path))
+            assert(extension == ".py"),\
+                "Parameter 'space' must be read from a python file"
+
+            # make the module importable by name
+            sys.path.insert(0, os.path.dirname(module_path))
+
+            import importlib
+
+            try:
+                space = getattr(importlib.import_module(module), name)
+            except (ImportError, AttributeError):
+                err = "Invalid space location or name"
+                self._logger.error(err)
+                raise Exception(err)
+
+        assert(isinstance(space, pyll.base.Apply)),\
+            "Parameter 'space' must be of hyperopt space type"
+
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self._space_attached = True
+
+    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
+            -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            e = 'The argument must be a numpy array or a pandas DataFrame'
+            self._logger.critical(e)
+            raise ValueError(e)
+
+    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
+                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    X_val: (pd.DataFrame, np.ndarray) = None,
+                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    cv: (list, int) = None):
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (not all algorithms require targets)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: list of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+
+        X_train = self._convert_to_array(X_train)
+        if y_train is not None:
+            y_train = self._convert_to_array(y_train)
+
+        if X_val is not None:
+            if cv is not None:
+                self._logger.warning(("Both validation set and cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on the validation set!"))
+
+            X_val = self._convert_to_array(X_val)
+
+            train_inds = list(range(len(X_train)))
+            val_inds = list(range(len(X_train),
+                                  len(X_train) + len(X_val)))
+
+            # cost is evaluated with a cross validation function
+            # that accepts an array and a cv object with
+            # indices of the fold splits.
+            # Here we create a trivial cv object
+            # with one validation split.
+            self._cv = [(train_inds, val_inds)]
+            self._X = np.concatenate([X_train, X_val])
+
+            if y_train is not None:
+                if y_val is None:
+                    err = "Argument y_val must be provided"
+                    self._logger.critical(err)
+                    raise ValueError(err)
+                else:
+                    y_val = self._convert_to_array(y_val)
+                    self._y = np.concatenate([y_train, y_val])
+            else:
+                self._y = None
+        else:
+            if cv is None:
+                self._logger.warning(("Neither validation set nor cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on 5 randomly "
+                                      "splitted folds."))
+
+            self._X = X_train
+            self._y = y_train
+            self._cv = cv
+
+        self._logger.info("Attached data")
+        self._data_attached = True
+
+    def _evaluate(self, pipeline: Pipeline) -> dict:
+        '''
+        This method is called in _objective.
+
+        Calculates the cost on the attached data.
+        This function can be overridden when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :output: dictionary with the aggregated
+            cross-validation score and
+            the score variance.
+        '''
+
+        scores = cross_validate(estimator=pipeline,
+                                X=self._X,
+                                y=self._y,
+                                cv=self._cv or 5,
+                                scoring=make_scorer(self._cost_func),
+                                error_score=np.nan)
+
+        return {'value': self._averaging_func(scores['test_score']),
+                'variance': np.var(scores['test_score'])}
+
+    def _objective(self, space_element: dict) -> dict:
+        '''
+        This method is called in search_for_best_pipeline
+        inside the hyperopt fmin method.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it should be multiplied by -1 to obtain loss.
+
+        :param dict space_element: must contain keys
+            name (with the name of the pipeline),
+            pipeline (Pipeline object),
+            params (dict of pipeline params)
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+        '''
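+        # Hedged example of the expected input (the names below are
+        # illustrative and mirror the space defined in __main__ at the
+        # bottom of this file):
+        # space_element = {
+        #     "name": "KBEST_XGBOOST",
+        #     "pipeline": Pipeline([("kbest", SelectKBest()),
+        #                           ("xgb", XGBClassifier())]),
+        #     "params": {"kbest__k": 3, "xgb__n_estimators": 70}}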
+        assert(isinstance(space_element, dict) and
+               set(['name', 'pipeline', 'params']) <= space_element.keys())
+
+        assert(isinstance(space_element['name'], str) and
+               isinstance(space_element['pipeline'], Pipeline) and
+               isinstance(space_element['params'], dict))
+
+        start_time = time.time()
+
+        if not self._data_attached:
+            raise Exception(("Data must be attached in order "
+                             "in order to effectuate the best"
+                             "pipeline search"))
+
+        self._run_number += 1
+
+        pipeline = space_element['pipeline']
+        params = space_element['params']
+        pipeline.set_params(**params)
+
+        self._logger.info(("Run number {0}: "
+                           "Current score is {1}: "
+                           "Training pipeline {2} "
+                           "with parameters: {3}. ").format(
+                             self._run_number,
+                             self._best_score,
+                             space_element['name'],
+                             params))
+
+        try:
+            score_stats = self._evaluate(pipeline)
+            assert(not np.isnan(score_stats["value"])),\
+                "Returned null score"
+
+            if self._run_number % self._backup_trials_freq == 0:
+                self._backup_trials()
+
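+            # Note: self._best_score != self._best_score is True only when
+            # the best score is NaN, i.e. when no successful trial exists yet.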
+            if (self._best_score != self._best_score) or\
+                self._score_factor*score_stats["value"] <\
+                    self._score_factor*self._best_score:
+
+                self._logger.info("Score got better, new best score is: {}"
+                                  .format(score_stats["value"]))
+
+                self._best_score = score_stats['value']
+
+                self._backup_trials()
+
+            end_time = time.time()
+
+            return {'loss': self._score_factor * score_stats["value"],
+                    'status': STATUS_OK,
+                    'score': score_stats["value"],
+                    'score_variance': score_stats["variance"],
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': end_time - start_time}
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            return {'loss': np.nan,
+                    'status': STATUS_FAIL,
+                    'score': np.nan,
+                    'score_variance': np.nan,
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': np.nan}
+
+    def search_for_best_pipeline(self,
+                                 niter: int,
+                                 algo: callable = tpe.suggest):
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param callable algo: currently can only take the values tpe.suggest
+            for a tree-based search or rand.suggest for a random search
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        assert(isinstance(niter, int)),\
+            "Parameter 'niter' must be of int type"
+
+        # right now only two algorithms are provided by hyperopt
+        assert(algo in [tpe.suggest, rand.suggest]),\
+            ("Parameter 'algo' can currently only be tpe or random. "
+             "If other algorithms have been developed "
+             "by hyperopt, please add them to the list.")
+
+        try:
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            best = fmin(fn=self._objective,
+                        space=self._space,
+                        algo=algo,
+                        trials=self._trials,
+                        max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best)))
+
+            self._backup_trials()
+
+        except Exception as e:
+            raise ValueError(("Failed to select best "
+                             "pipeline! Exit with error: {}").format(e))
+
+    @property
+    def best_trial_score(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_score_variance(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score_variance"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_pipeline(self) -> Pipeline:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > 0:
+
+            return space_eval(
+                    self._space,
+                    {k: v[0] for k, v in
+                     self._trials.best_trial['misc']['vals'].items()
+                     if len(v) > 0})["pipeline"]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def _ith_trial_loss(self, i: int) -> float:
+        '''
+        '''
+        if i < len(self._trials.trials):
+            return self._trials.trials[i]['result']['loss']
+        else:
+            return np.nan
+
+    def _ith_trial_element(self, i: int, name: str) -> object:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if i < len(self._trials.trials):
+            return space_eval(self._space,
+                              {k: v[0] for k, v in
+                               self._trials.trials[i]['misc']['vals']
+                               .items() if len(v) > 0})[name]
+
+    def _ith_trial_pipeline(self, i: int) -> Pipeline:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='pipeline')
+
+    def _ith_trial_name(self, i: int) -> str:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='name')
+
+    def _ith_trial_params(self, i: int) -> dict:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='params')
+
+    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
+        '''
+        '''
+        if i < len(self._trials.trials):
+            return self._trials.trials[i]["result"]["timestamp"]
+
+    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
+        '''
+        Returns the list of n best pipelines
+        documented in trials
+        '''
+        if len(self._trials.trials) > 0:
+            if losses is None:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))]
+
+            best_n_indices = [losses.index(l)
+                              for l in sorted(list(set(losses)))[:n]]
+
+            return [self._ith_trial_pipeline(i) for i in best_n_indices]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
+        '''
+        Returns a dictionary where keys are pipeline names,
+        and values are lists of the best pipelines with this name
+        '''
+        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
+
+        if len(self._trials.trials) > 0:
+
+            best_pipelines_per_type = {}
+            names = set(self._ith_trial_name(i)
+                        for i in range(len(self._trials.trials)))
+
+            for nm in names:
+                # mask trials of other pipeline types with an infinite
+                # loss, so that the indices found in
+                # get_n_best_trial_pipelines still refer to the
+                # correct trials
+                losses = [self._ith_trial_loss(i)
+                          if self._ith_trial_name(i) == nm
+                          else float("inf")
+                          for i in range(len(self._trials.trials))]
+
+                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
+                                                        n=n,
+                                                        losses=losses)
+
+            return best_pipelines_per_type
+
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def write_trials_documentation(self, path: str = None):
+        '''
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        '''
+        if len(self._trials.trials) > 0:
+            path = path or "hyperopt_trials_documentation.xlsx"
+
+            assert(isinstance(path, str)),\
+                "Parameter 'path' must be of string type"
+
+            self._assert_valid_directory(path)
+
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+            scores = [self._score_factor*self._ith_trial_loss(i)
+                      for i in range(len(self._trials.trials))]
+            params = [self._ith_trial_params(i)
+                      for i in range(len(self._trials.trials))]
+            timestamps = [self._ith_trial_timestamp(i)
+                          for i in range(len(self._trials.trials))]
+
+        else:
+            names = []
+            scores = []
+            params = []
+            timestamps = []
+
+        pd.DataFrame({"name": names,
+                      "score": scores,
+                      "params": params,
+                      "timestamp": timestamps})\
+          .to_excel(path)
+
+
+if __name__ == '__main__':
+
+    from sklearn.metrics import roc_auc_score, make_scorer
+    from xgboost import XGBClassifier
+    from sklearn.svm import SVC
+    from sklearn.feature_selection import SelectKBest
+    from sklearn.decomposition import PCA
+    from sklearn.datasets import load_iris
+    from pprint import pprint
+
+    data = load_iris()
+    X = pd.DataFrame(data.data)
+    y = pd.Series(data.target)
+    # produce a binary variable
+    y = (y == 2).astype(int)
+    del data
+    gc.collect()
+
+    # SPACE DEFINITION ########################################
+    # (can be moved to a separate python script)
+
+    """
+    A search space must be a list of dictionaries.
+    Each dictionary must have the keys:
+        name (pipeline name or type),
+        pipeline (instance of sklearn.pipeline.Pipeline),
+        params (dictionary of distributions for the parameters of
+                the pipeline that we want to tune)
+
+    Here we have a space that consists of two dictionaries:
+    KBEST_XGBOOST and PCA_SVC
+    """
+    space = []
+
+    pipeline_dist_1 = {}
+    pipeline_dist_1["name"] = "KBEST_XGBOOST"
+
+    """
+    A pipeline consists of steps (tuples).
+    Each step has a name and an algorithm.
+    This pipeline, as a first step performs
+    feature selection with SelectKBest and
+    as a second step evaluates a machine learning algo (xgboost).
+
+    Like all sklearn algorithms, a Pipeline has methods
+    fit, predict, set_params, get_params
+    """
+    pipeline_dist_1["pipeline"] = Pipeline([
+                                     ('kbest', SelectKBest()),
+                                     ('xgb', XGBClassifier())
+                                     ])
+    """
+    Pipeline parameter dictionaries must be of the form:
+    {'kbest__k': 3, 'xgb__n_estimators': 20},
+    each parameter name consists of the step name, __, and parameter name.
+
+    Here, instead of values, the parameter names are followed
+    by hyperopt distributions.
+    Each hyperopt distribution also must have a name,
+    due to hyperopt functionality.
+
+    Here, we set the hyperopt distribution name to the parameter name,
+    but it does not have to be so. Hyperopt distribution names
+    must be different for different elements of the space.
+    """
+
+    pipeline_dist_1["params"] = {
+            'kbest__k': hp.choice('kbest__k', range(1, 5)),
+
+            'xgb__n_estimators':
+            50 + hp.randint('xgb__n_estimators', 50),
+
+            "xgb__learning_rate":
+            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
+            }
+
+    space.append(pipeline_dist_1)
+
+    pipeline_dist_2 = {}
+    pipeline_dist_2["name"] = "PCA_SVC"
+
+    pipeline_dist_2["pipeline"] = Pipeline([
+                                     ('pca', PCA()),
+                                     ('svc', SVC(gamma="scale"))
+                                     ])
+
+    pipeline_dist_2["params"] = {
+            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
+
+            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
+            }
+
+    space.append(pipeline_dist_2)
+
+    space = hp.choice('pipelines', space)
+
+    # TESTING ##########################################################
+
+    trials_path = 'TEST_hyperopt_trials.pkl'
+
+    doc_path = 'TEST_hyperopt_doc.xlsx'
+
+    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
+                                       greater_is_better=True,
+                                       trials_path=trials_path)
+
+    hp_obj.attach_data(X_train=X, y_train=y)
+
+    hp_obj.attach_space(space=space)
+
+    hp_obj.search_for_best_pipeline(niter=10)
+
+    print('\n', '='*20, 'TESTING', '='*20)
+
+    print('\n', 'Best score:', hp_obj.best_trial_score)
+
+    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
+
+    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
+
+    print('\n', 'Best 3 pipelines: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
+
+    print('\n', 'Best pipeline per type: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
+
+    hp_obj.write_trials_documentation(path=doc_path)
+
+    # os.remove(doc_path)
+    # os.remove(trials_path)

+ 130 - 0
import_process_instances/CleanProcessTable.py

@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 08:55:56 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.db_migration.MigrationCleaning import MigrationCleaning
+
+
+class CleanTable(MigrationCleaning):
+    '''
+    '''
+
+    def __init__(self, mapping_path: str,
+                 inconsist_report_table: str,
+                 filter_index_columns: (str, list),
+                 sort_columns: list = None,
+                 index_columns: list = None,
+                 log_name: str = "CleanProcessTable"):
+        '''
+        '''
+        super().__init__(
+                mapping_path=mapping_path,
+                schema_paths=[os.path.join(".", "mongo_schema",
+                                           "schema_process_instances.json"),
+                              os.path.join(".", "mongo_schema",
+                                           "schema_wheelsets.json"),
+                              os.path.join(".", "mongo_schema",
+                                           "schema_components.json")],
+                inconsist_report_table=inconsist_report_table,
+                filter_index_columns=filter_index_columns,
+                log_name=log_name)
+
+        self._tablename = os.path.basename(self._mapping_path)\
+                            .split("_mapping")[0]
+
+        self._sort_columns = sort_columns
+        self._index_columns = index_columns
+
+        from libraries.db_handlers.SQLHandler import SQLHandler
+
+        self._sql_db = SQLHandler()
+
+    def read_data(self, wheelsets):
+        '''
+        '''
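+        # str(tuple(...)) of a single element renders as ('x',), which is
+        # not valid SQL; hence the separate branch for exactly one wheel-set.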
+        if len(wheelsets) > 1:
+            query = "SELECT * FROM {0} WHERE radsatznummer in {1}"\
+                    .format(self._tablename, tuple(wheelsets))
+        else:
+            query = "SELECT * FROM {0} WHERE radsatznummer = '{1}'"\
+                    .format(self._tablename, wheelsets[0])
+
+        return self._sql_db.read_sql_to_dataframe(query)
+
+    def drop_duplicated_entries(self, data: pd.DataFrame,
+                                columns_to_ignore: list = None
+                                ) -> pd.DataFrame:
+        '''
+        '''
+        if columns_to_ignore is None:
+            columns_to_ignore = ["ende_der_bearbeitung"]
+
+        self.error_column_abscence(columns=columns_to_ignore, data=data)
+
+        defining_columns = [c for c in data.columns
+                            if c not in columns_to_ignore]
+
+        return data.drop_duplicates(subset=defining_columns)\
+                   .reset_index(drop=True)
+
+    @property
+    def field_mapping(self):
+        '''
+        '''
+        return self._mapping_parser.get_field_mapping()
+
+
+class CleanProcessTable(CleanTable):
+    '''
+    '''
+    def __init__(self, mapping_path: str,
+                 inconsist_report_table: str = None,
+                 filter_index_columns=["radsatznummer"],
+                 sort_columns: list = None,
+                 index_columns: list = None,
+                 log_name: str = "CleanProcessTable"):
+        '''
+        '''
+        super().__init__(
+                mapping_path=mapping_path,
+                sort_columns=sort_columns,
+                index_columns=index_columns,
+                inconsist_report_table=inconsist_report_table,
+                filter_index_columns=filter_index_columns,
+                log_name=log_name)
+
+    def _get_next_station_start_time(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self.error_column_abscence(columns=["radsatznummer", "positionsnummer",
+                                            "begin_der_bearbeitung"],
+                                   data=data)
+
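+        # Approach: after sorting, each row looks at the next row's
+        # begin_der_bearbeitung within the same wheel-set. Where the next
+        # row still belongs to the same station the value is blanked out and
+        # back-filled, so every row ends up with the start time of the next
+        # station. The sentinel "temp" marks the last row of a wheel-set,
+        # which has no next station and becomes NaT.
+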
+        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung"],
+                         inplace=True)
+
+        start_time_next_station =\
+            data.groupby("radsatznummer")["begin_der_bearbeitung"].shift(-1)\
+                .fillna("temp")
+
+        station_change = (data.groupby("radsatznummer")["positionsnummer"]
+                              .shift(-1) != data["positionsnummer"])
+
+        start_time_next_station.loc[~station_change] = np.nan
+
+        start_time_next_station.fillna(method="bfill", inplace=True)
+
+        start_time_next_station.loc[start_time_next_station == "temp"] = np.nan
+
+        return pd.to_datetime(start_time_next_station)

+ 87 - 0
import_process_instances/CleanRs0.py

@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:14:46 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanTable
+
+
+class CleanRs0(CleanTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs0_mapping.json"),
+                inconsist_report_table="inconsist_rs0",
+                filter_index_columns=["radsatznummer"],
+                sort_columns=["radsatznummer", "eingabe_datum"],
+                index_columns=["radsatznummer", "eingabe_datum"],
+                log_name="CleanRs0:")
+
+    def restrict_to_process_data(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        process_columns = ["radsatznummer", "aufarbeitungstyp", "ihs",
+                           "befundung_code_1", "befundung_code_2",
+                           "befundung_code_3"]
+
+        self.error_column_abscence(columns=process_columns,
+                                   data=data)
+
+        return data[process_columns]
+
+    def add_ist_schrott(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        mongo_name = "final_state.ist_schrott"
+
+        self.error_column_abscence(columns=["aufarbeitungstyp"],
+                                   data=data)
+
+        data[mongo_name] = (data["aufarbeitungstyp"] == 2)
+
+        return data
+
+    def restrict_to_meta_data(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        meta_columns = [c for c in data.columns if c not in
+                        ["aufarbeitungstyp", "ihs",
+                         "befundung_code_1", "befundung_code_2",
+                         "befundung_code_3"]]
+
+        self.error_column_abscence(columns=meta_columns,
+                                   data=data)
+
+        return data[meta_columns]
+
+    def filter_invalid_metacolumns(self, data: pd.DataFrame,
+                                   metacolumns: list = None) -> pd.DataFrame:
+        '''
+        '''
+        if metacolumns is None:
+            metacolumns = ["wellentype", "Lagerbauart", "tauschgruppe"]
+
+        for column in metacolumns:
+
+            invalid_mask = data[column].isnull()
+
+            reason = "Missing {}".format(column)
+
+            data = self._filter_invalid_data(invalid_mask=invalid_mask,
+                                             reason=reason,
+                                             data=data)
+
+        return data

+ 170 - 0
import_process_instances/CleanRs1.py

@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 09:59:54 2019
+
+@author: tanya
+"""
+
+import gc
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
+
+
+class CleanRs1(CleanProcessTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs1_mapping.json"),
+                inconsist_report_table="inconsist_rs1",
+                sort_columns=["radsatznummer", "begin_der_bearbeitung"],
+                index_columns=["radsatznummer"],
+                log_name="CleanRs1")
+
+    def clean_ende_der_bearbeitung(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        We filter out all the data that has a missing begin_der_bearbeitung
+         (these cases should be very rare);
+         if ende_der_bearbeitung is missing, we fill it with the
+         begin_der_bearbeitung of the next station.
+        '''
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "ende_der_bearbeitung",
+                                            "begin_der_bearbeitung",
+                                            "status"],
+                                   data=data)
+
+        for time_column in ["ende_der_bearbeitung", "begin_der_bearbeitung"]:
+            data[time_column] = pd.to_datetime(data[time_column])
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        start_time_next_station = self._get_next_station_start_time(data=data)
+
+        data["ende_der_bearbeitung"].fillna(start_time_next_station,
+                                            inplace=True)
+
+        del start_time_next_station
+        gc.collect()
+
+        return data
+
+    def filter_invalid_ende_der_bearbeitung(self, data: pd.DataFrame
+                                            ) -> pd.DataFrame:
+        '''
+        '''
+        is_invalid = (
+                (data["ende_der_bearbeitung"].isnull() &
+                 (data["status"] != "Aktiv")) |
+                (data["begin_der_bearbeitung"].isnull()) |
+                (data["ende_der_bearbeitung"] < data["begin_der_bearbeitung"]))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid ende der bearbeitung")
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        return data
+
+    def filter_invalid_status(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        We filter out the cases when work at a station was finished
+         with the status "Aktiv" or "Abbruch". An exception is the very last
+         station per wheel-set, because it can be a non-finished process.
+        '''
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "positionsnummer",
+                                            "status"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        is_station_change = (data["positionsnummer"] !=
+                             data["positionsnummer"].shift(-1))
+
+        is_last_station = (data["radsatznummer"] !=
+                           data["radsatznummer"].shift(-1))
+
+        has_invalid_status = (
+                is_station_change &
+                (~is_last_station) &
+                (data["status"].isin(["Aktiv", "Abbruch"])))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=has_invalid_status,
+                    reason="invalid status")
+
+        return data
+
+    def add_finished(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        We add a variable indicating if the process is finished or not
+        '''
+        mongo_name = "final_state.finished"
+
+        self.error_column_abscence(columns=["radsatznummer", "status"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        not_finished = ["Aktiv", "Abbruch"]
+
+        last_status_map = data.groupby("radsatznummer")["status"].last()
+
+        data[mongo_name] = ~data["radsatznummer"].map(last_status_map)\
+                                                 .isin(not_finished)
+
+        return data
+
+    def add_stage(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        In the configuration we store the process stage definitions in the
+         form of a graph.
+        '''
+        from libraries.configuration import default as cfg
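+        # Hedged sketch of the assumed configuration: cfg.process_stages is
+        # a directed graph (networkx-style) whose nodes are stage names and
+        # whose node attribute "stations" holds the station numbers of that
+        # stage, e.g.
+        #   cfg.process_stages.nodes()[stage]["stations"] -> [110, 115]
+        #   cfg.process_stages.successors(stage)          -> following stages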
+
+        mongo_name = "process.stage"
+
+        self.error_column_abscence(columns=["radsatznummer", "positionsname"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        def cumsum_str(x):
+            return x.cumsum()
+
+        def break_cum_string_to_list(x):
+            return [int(st) for st in x.split("|")[:-1]]
+
+        previous_stations = data\
+            .assign(positionsnummer=data.positionsnummer.astype(str).add("|"))\
+            .groupby("radsatznummer")["positionsnummer"]\
+            .apply(cumsum_str)\
+            .apply(break_cum_string_to_list)
+
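+        # previous_stations now holds, for every row, the cumulative list of
+        # station numbers this wheel-set has visited so far. Below, a row is
+        # assigned to a stage if it has reached at least one of that stage's
+        # stations but none of the stations of the directly following stages.
+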
+        for stage in cfg.process_stages.nodes():
+            this_stage_stations = cfg.process_stages.nodes()[stage]["stations"]
+            next_stage_stations = [item for next_stage
+                                   in cfg.process_stages.successors(stage)
+                                   for item in cfg.process_stages.nodes()
+                                   [next_stage]["stations"]]
+
+            def check_stage(x):
+                return (len(set(this_stage_stations) & set(x)) != 0) and \
+                       (len(set(next_stage_stations) & set(x)) == 0)
+
+            data.loc[previous_stations.apply(check_stage), mongo_name] = stage
+
+        return data

+ 82 - 0
import_process_instances/CleanRs2.py

@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:06:48 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
+
+
+class CleanRs2(CleanProcessTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs2_mapping.json"),
+                inconsist_report_table="inconsist_rs2",
+                sort_columns=["radsatznummer", "ende_der_bearbeitung"],
+                index_columns=["radsatznummer", "positionsnummer"],
+                log_name="CleanRs2")
+
+    def filter_invalid_ende_der_bearbeitung(self, data: pd.DataFrame
+                                            ) -> pd.DataFrame:
+        '''
+        We filter out all the rows that have a missing ende_der_bearbeitung;
+         it means that the activities were planned, but not executed.
+        '''
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "ende_der_bearbeitung"],
+                                   data=data)
+
+        is_invalid = (data["ende_der_bearbeitung"].isnull())
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid ende der bearbeitung")
+
+        data["ende_der_bearbeitung"] =\
+            pd.to_datetime(data["ende_der_bearbeitung"])
+
+        return data
+
+    def filter_invalid_taetigkeitsname(self, data: pd.DataFrame
+                                       ) -> pd.DataFrame:
+        '''
+        In the configuration we store a list of activities whose execution
+         means that the wheel-set is scrap.
+         After the execution of these activities the process history
+         should end.
+        '''
+        from libraries.configuration import default as cfg
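+        # Assumption: cfg.schrott_taetigkeiten is a list of activity names
+        # that scrap the wheel-set; the concrete values are defined in the
+        # configuration, not here.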
+
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "taetigkeitsname"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        is_last_station = (
+            data["radsatznummer"] !=
+            data["radsatznummer"].shift(-1))
+
+        is_invalid = (
+                ~is_last_station &
+                (data["taetigkeitsname"].isin(cfg.schrott_taetigkeiten)))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid taetigkeit")
+
+        return data
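filter_invalid_taetigkeitsname (and the analogous filter_invalid_schadcode in CleanRs70 below) flags scrap activities that are not the last recorded row of their wheel-set, using a shift(-1) comparison on the sorted frame. A small sketch of that pattern, with a made-up scrap list standing in for cfg.schrott_taetigkeiten:

import pandas as pd

# hypothetical stand-in for cfg.schrott_taetigkeiten
schrott_taetigkeiten = ["verschrotten"]

df = pd.DataFrame({"radsatznummer": ["A", "A", "B"],
                   "taetigkeitsname": ["verschrotten", "pruefen", "pruefen"]})

# after sorting, a row is the last one of its wheel-set when the next row
# belongs to a different radsatznummer
is_last_row = df["radsatznummer"] != df["radsatznummer"].shift(-1)

is_invalid = ~is_last_row & df["taetigkeitsname"].isin(schrott_taetigkeiten)

print(is_invalid.tolist())   # [True, False, False]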

+ 58 - 0
import_process_instances/CleanRs70.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:11:55 2019
+
+@author: tanya
+"""
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
+
+
+class CleanRs70(CleanProcessTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs70_mapping.json"),
+                inconsist_report_table="inconsist_rs70",
+                sort_columns=["radsatznummer", "eingabe_datum"],
+                index_columns=["radsatznummer", "eingabe_datum"],
+                log_name="CleanRs70")
+
+    def filter_invalid_schadcode(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        The configuration stores a list of schadcodes whose assignment
+         means that the product is scrap. No further schadcodes should be
+         assigned after such a schadcode.
+        '''
+        from libraries.configuration import default as cfg
+
+        self.error_column_abscence(columns=["radsatznummer", "schadcode"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        is_last_schadcode = (data["radsatznummer"] !=
+                             data["radsatznummer"].shift(-1))
+
+        is_invalid = (~is_last_schadcode &
+                      data["schadcode"].isin(cfg.schrott_schadcodes))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid schadcode")
+
+        # XXX temporary here
+        # data["eingabe_datum"] = pd.to_datetime(data["eingabe_datum"])
+
+        return data

+ 149 - 0
import_process_instances/MergeProcessTables.py

@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:16:23 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanRs1 import CleanRs1
+
+
+class MergeProcessTables:
+    '''
+    '''
+    def merge_rs2(self, data: pd.DataFrame, rs2: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Difficulty: rows that correspond to one radsatznummer and one station
+         in rs1 and rs2 are in a many-to-many relation, and the
+         ende_der_bearbeitung of these rows often does not match
+         between the two tables.
+
+        Rules:
+            A) An activity from rs2 is matched to an rs1 entry if its
+            ende_der_bearbeitung is >= begin_der_bearbeitung and
+            <= ende_der_bearbeitung of that entry.
+
+            B) If an activity (row in rs2) has an ende_der_bearbeitung later
+            than the ende_der_bearbeitung of all rs1 entries, we check
+            whether it ended before begin_der_bearbeitung on the next
+            station. If so, we assign the activity to the latest rs1 entry
+            for this station.
+
+        The same logic applies when merging the table rs70.
+        '''
+        data = data.copy(deep=True)
+        rs2 = rs2.copy(deep=True)
+
+        station_change = (data["positionsnummer"] !=
+                          data["positionsnummer"].shift(-1))
+
+        data["order"] = data.index
+
+        common_columns = ["radsatznummer", "positionsnummer",
+                          "positionsname"]
+
+        data = pd.merge(data, rs2, how="left", on=common_columns)
+
+        start_time_next_station =\
+            CleanRs1()._get_next_station_start_time(data)\
+                      .fillna(data["ende_der_bearbeitung_x"])
+
+        start_matches = (data["ende_der_bearbeitung_y"] >=
+                         data["begin_der_bearbeitung"])
+
+        end_matches = ((data["ende_der_bearbeitung_y"] <=
+                       data["ende_der_bearbeitung_x"]) |
+                       data["ende_der_bearbeitung_y"].isnull())
+
+        end_almost_matches = ((data["ende_der_bearbeitung_y"] <=
+                               start_time_next_station) &
+                              station_change
+                              )
+
+        time_matches = (start_matches & end_matches) |\
+                       (start_matches & (~end_matches) & end_almost_matches)
+
+        rs2_columns = [c for c in rs2.columns
+                       if (c not in common_columns) and (c in data.columns)] +\
+                      [c + "_y" for c in rs2.columns
+                       if c + "_y" in data.columns]
+
+        for c in rs2_columns:
+            data.loc[~time_matches, c] = np.nan
+
+        data.sort_values(by=["radsatznummer",
+                             "begin_der_bearbeitung",
+                             "ende_der_bearbeitung_y"],
+                         inplace=True)
+
+        # we keep all the rows that were in rs1 even if there are no
+        # corresponding activities from rs2
+        keep_row = time_matches | (~data["order"].duplicated(keep="first"))
+
+        data = data.loc[keep_row].copy(deep=True).reset_index(drop=True)
+
+        data["ende_der_bearbeitung"] = data[["ende_der_bearbeitung_x",
+                                             "ende_der_bearbeitung_y"]]\
+            .max(axis=1)
+
+        data.drop(["ende_der_bearbeitung_x", "ende_der_bearbeitung_y",
+                   "order"], axis=1, inplace=True)
+
+        return data
+
+    def merge_rs70(self, data: pd.DataFrame, rs70: pd.DataFrame
+                   ) -> pd.DataFrame:
+        '''
+        '''
+        data["order"] = data.index
+
+        data = pd.merge(data, rs70, how="left", on="radsatznummer")
+
+        time_matches = (
+                (data["eingabe_datum"] >= data["begin_der_bearbeitung"]) &
+                (data["eingabe_datum"] <= data["ende_der_bearbeitung"]))
+
+        rs70_columns = [c for c in rs70.columns
+                        if (c != "radsatznummer") and (c in data.columns)] +\
+                       [c + "_y" for c in rs70.columns
+                        if c + "_y" in data.columns]
+
+        for c in rs70_columns:
+            data.loc[~time_matches, c] = np.nan
+
+        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung",
+                             "eingabe_datum"], inplace=True)
+
+        keep_row = time_matches | (~data["order"].duplicated(keep="first"))
+
+        data = data.loc[keep_row]\
+                   .drop("order", axis=1)\
+                   .reset_index(drop=True)
+
+        return data
+
+    def merge_rs0(self, data: pd.DataFrame, rs0: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        data = pd.merge(data, rs0, how="left", on="radsatznummer")
+
+        no_befundung_mask = (data["positionsnummer"] != 110)
+
+        for column in ["befundung_code_1",
+                       "befundung_code_2",
+                       "befundung_code_3"]:
+
+            data.loc[no_befundung_mask, column] = np.nan
+
+        return data
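The interval matching in merge_rs2 above follows rules A and B from its docstring: an rs2 activity is kept on an rs1 row only if it ends inside that row's [begin_der_bearbeitung, ende_der_bearbeitung] window, with rule B relaxing the upper bound at a station change. A toy illustration of rule A alone, on made-up timestamps (the station-change relaxation and the CleanRs1 helper are left out):

import pandas as pd

rs1 = pd.DataFrame({"radsatznummer": ["A"],
                    "positionsnummer": [100],
                    "begin_der_bearbeitung": pd.to_datetime(["2019-01-01 08:00"]),
                    "ende_der_bearbeitung": pd.to_datetime(["2019-01-01 12:00"])})

rs2 = pd.DataFrame({"radsatznummer": ["A", "A"],
                    "positionsnummer": [100, 100],
                    "taetigkeitsname": ["pruefen", "nacharbeit"],
                    "ende_der_bearbeitung": pd.to_datetime(["2019-01-01 09:30",
                                                            "2019-01-01 14:00"])})

# ende_der_bearbeitung exists in both tables and gets the _x/_y suffixes
merged = pd.merge(rs1, rs2, how="left", on=["radsatznummer", "positionsnummer"])

rule_a = ((merged["ende_der_bearbeitung_y"] >= merged["begin_der_bearbeitung"]) &
          (merged["ende_der_bearbeitung_y"] <= merged["ende_der_bearbeitung_x"]))

print(rule_a.tolist())   # [True, False]: only the 09:30 activity falls in the window

merge_rs70 applies the same idea, requiring eingabe_datum to fall inside the window.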

BIN
import_process_instances/__pycache__/CleanProcessTable.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs0.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs1.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs2.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs70.cpython-37.pyc


BIN
import_process_instances/__pycache__/MergeProcessTables.cpython-37.pyc


BIN
import_process_instances/__pycache__/parallelized_import.cpython-37.pyc


+ 74 - 0
import_process_instances/parallelized_import.py

@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct  1 11:15:03 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+from typing import Callable
+sys.path.append(os.getcwd())
+
+
+def get_all_wheelsets():
+    '''
+    return: list of distinct wheelset numbers in the process
+    '''
+    from libraries.db_handlers.SQLHandler import SQLHandler
+
+    sql_db = SQLHandler()
+
+    query = "SELECT DISTINCT radsatznummer FROM rs1"
+
+    return sql_db.read_sql_to_dataframe(query)["radsatznummer"].tolist()
+
+
+def parallelized_import(all_instances: list,
+                        mongo_schema_path: str,
+                        import_chunk: Callable,
+                        log_name: str = None):
+
+    from concurrent.futures import ThreadPoolExecutor
+
+    from libraries.db_handlers.MongodbHandler import MongodbHandler
+
+    from libraries.log import Log
+
+    import argparse
+
+    argparser = argparse.ArgumentParser(
+            description='Import process instances collection')
+    argparser.add_argument('--chunksize', type=int, default=100,
+                           help="Number of wheelsets processed at a time")
+    argparser.add_argument('--max_workers', type=int, default=10,
+                           help="Number of workers in ThreadPoolExecutor")
+    args = argparser.parse_args()
+
+    log = Log(log_name)
+
+    log.info("Start application")
+    log.info("Processing {0} wheelsets at a time parallelized with {1} workers"
+             .format(args.chunksize, args.max_workers))
+
+    # str.strip removes a set of characters, not a prefix,
+    # so drop the "schema_" prefix explicitly instead
+    collection_name = os.path.basename(mongo_schema_path).split(".")[0]
+    if collection_name.startswith("schema_"):
+        collection_name = collection_name[len("schema_"):]
+
+    mongodb = MongodbHandler()
+
+    mongodb.create_collection_and_set_schema(
+            collection_name=collection_name,
+            schema_path=mongo_schema_path)
+
+    try:
+        n_chunks = len(all_instances)//args.chunksize + 1
+
+        with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+            for i in range(n_chunks):
+                executor.submit(import_chunk,
+                                all_instances[i*args.chunksize:(i+1)*args.chunksize], i)
+
+    except Exception as e:
+        err = ("Failed to import {0} in mongodb. "
+               "Exit with error: {1}".format(collection_name, e))
+        log.error(err)
+        raise Exception(e)
+
+    log.info("Finished application")

+ 58 - 0
log.py

@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+"""
+@author: jürgen.pannosch, tanja.zolotareva
+"""
+
+import sys
+import os
+import logging
+
+
+class Log:
+    def __init__(self, name: str = None,
+                 log_file: str = None,
+                 log_level: str = "INFO",
+                 print_to_stdout: bool = True):
+        """Sets the log level and the path where the log file is stored
+
+        :param log_file: Path to the log file.
+        :param log_level: Log level."""
+
+        if name is None:
+            name = ''
+
+        self._logger = logging.getLogger(name)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_file is None:
+            log_file = os.path.join(".", "all.log")
+
+        assert(isinstance(log_file, str)),\
+            "Parameter 'log_path' must be of string type"
+
+        formatter = logging.Formatter(
+                '\n %(name)s %(asctime)s %(levelname)s %(message)s')
+
+        # dirname is empty for a bare file name; only create real directories
+        if os.path.dirname(log_file):
+            os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(formatter)
+        self._logger.addHandler(file_handler)
+
+        if print_to_stdout:
+            stream_handler = logging.StreamHandler(sys.stdout)
+            stream_handler.setFormatter(formatter)
+            self._logger.addHandler(stream_handler)
+
+        self._logger.setLevel(log_level)
+
+    def info(self, message: str):
+        self._logger.info(message)
+
+    def warning(self, message: str):
+        self._logger.warning(message)
+
+    def error(self, message: str):
+        self._logger.error(message)
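Log is a thin wrapper around the standard logging module: it clears any handlers already attached to the named logger, always writes to a file (./all.log by default) and optionally mirrors messages to stdout. A minimal usage sketch, assuming the module is importable as libraries.log as in the other files; the log file path is made up:

import os

from libraries.log import Log   # assumed import path

log = Log(name="demo",
          log_file=os.path.join(".", "logs", "demo.log"),
          log_level="INFO")

log.info("Start application")
log.warning("Something looks odd")
log.error("Something went wrong")
# messages go to ./logs/demo.log and, with print_to_stdout=True (default), to stdout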

+ 73 - 0
utils/ClassLogging.py

@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 27 14:20:58 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import pandas as pd
+sys.path.append(os.getcwd())
+
+
+class ClassLogging:
+    '''
+    '''
+    def __init__(self, log_name: str = None):
+        '''
+        '''
+        from libraries.log import Log
+
+        self._log = Log(log_name)
+
+    def log_and_raise(self, message):
+        '''
+        '''
+        self._log.error(message)
+
+        raise Exception(message)
+
+    def log_and_warn(self, message):
+        '''
+        '''
+        self._log.warning(message)
+
+    def check_is_file(self, path):
+        '''
+        '''
+        if not os.path.isfile(path):
+            err = "File {} not found".format(path)
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
+                               error_or_warning: str):
+        '''
+        '''
+        if isinstance(columns, str):
+            columns = [columns]
+
+        for column in columns:
+
+            if column not in data.columns:
+                err = ("Column {} is missing from the data".format(column))
+                getattr(self._log, error_or_warning)(err)
+
+                if error_or_warning == "error":
+                    raise Exception(err)
+
+    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame):
+        '''
+        '''
+        return self._check_column_abscence(columns=columns,
+                                           data=data,
+                                           error_or_warning="error")
+
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
+        '''
+        '''
+        return self._check_column_abscence(columns=columns,
+                                           data=data,
+                                           error_or_warning="warning")
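ClassLogging is the base class the cleaning classes inherit from: error_column_abscence logs and raises when a required column is missing, while warn_column_abscence only logs. A short sketch against a throw-away DataFrame; the import path is an assumption based on the repository layout:

import pandas as pd

from libraries.utils.ClassLogging import ClassLogging   # assumed import path

checker = ClassLogging(log_name="demo")

df = pd.DataFrame({"radsatznummer": ["A", "B"]})

checker.warn_column_abscence(columns="positionsnummer", data=df)    # logs a warning
checker.error_column_abscence(columns=["radsatznummer"], data=df)   # present, passes
# error_column_abscence(columns="positionsnummer", data=df) would log and raise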

+ 62 - 0
utils/CleaningUtils.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 27 16:20:03 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+
+
+class CleaningUtils:
+    '''
+    '''
+    @staticmethod
+    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
+        '''
+        '''
+        # list("%d%m%Y") would split a format string into characters,
+        # so wrap a single format in a list instead
+        if isinstance(formats, str):
+            formats = [formats]
+
+        # keep the original index so that fillna below aligns correctly
+        converted = pd.Series([pd.to_datetime(np.nan)]*len(series),
+                              index=series.index)
+
+        for formt in formats:
+            if formt == "%d%m%Y":
+                missing_leading_zero = (series.astype(str).str.len() == 7)
+
+                series = series.astype(str)
+
+                series.loc[missing_leading_zero] = "0" +\
+                    series.loc[missing_leading_zero]
+
+            converted_this_format = pd.to_datetime(series,
+                                                   format=formt,
+                                                   errors="coerce")
+
+            converted.fillna(converted_this_format, inplace=True)
+
+        return converted
+
+    def standarize_writing(self, s: str):
+        '''
+        '''
+        import re
+
+        german_character_mapping = {"ß": "ss",
+                                    "ü": "ue",
+                                    "Ü": "Ue",
+                                    "ä": "ae",
+                                    "Ä": "Ae",
+                                    "ö": "oe",
+                                    "Ö": "Oe"}
+
+        s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
+        for char, correct_char in german_character_mapping.items():
+            s = s.replace(char, correct_char)
+
+        s = s.lower()
+
+        s = re.sub('[^0-9a-zA-Z]+', '_', s)
+
+        return s
+
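convert_dates tries each format in turn and keeps the first successful parse per value, with a special case that restores the leading zero of 7-character %d%m%Y dates; standarize_writing transliterates German umlauts, lowercases and collapses non-alphanumeric runs into underscores. A small sketch on made-up values; the import path is assumed:

import pandas as pd

from libraries.utils.CleaningUtils import CleaningUtils   # assumed import path

# mixed formats; the last value is a %d%m%Y date missing its leading zero
dates = pd.Series(["2019-09-27", "27092019", "1102019"])

parsed = CleaningUtils.convert_dates(dates, formats=["%Y-%m-%d", "%d%m%Y"])
print(parsed.dt.strftime("%Y-%m-%d").tolist())
# ['2019-09-27', '2019-09-27', '2019-10-01']

print(CleaningUtils().standarize_writing("Änderung der Oberfläche"))
# 'aenderung_der_oberflaeche'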

BIN
utils/__pycache__/ClassLogging.cpython-37.pyc


BIN
utils/__pycache__/CleaningUtils.cpython-37.pyc