tanja 5 years ago
Parent
Commit
03ba54fdbd
40 changed files with 5168 additions and 0 deletions
  1. BIN
      cdplib/__pycache__/__init__.cpython-37.pyc
  2. 270 0
      cdplib/feature_engineering/StatisticalFeatures.py
  3. 77 0
      cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py
  4. 53 0
      cdplib/feature_engineering/StatisticalFeaturesOverTime.py
  5. 798 0
      cdplib/hyperopt/HyperoptPipelineSelection.py
  6. 211 0
      db_handlers/MongodbHandler.py
  7. 595 0
      db_handlers/SQLHandler.py
  8. BIN
      db_handlers/__pycache__/MongodbHandler.cpython-37.pyc
  9. BIN
      db_handlers/__pycache__/SQLHandler.cpython-37.pyc
  10. BIN
      db_handlers/__pycache__/SQLOperations.cpython-37.pyc
  11. 352 0
      db_migration/DataFrameToCollection.py
  12. 520 0
      db_migration/MigrationCleaning.py
  13. 62 0
      db_migration/ParseDbSchema.py
  14. 332 0
      db_migration/ParseJsonSchema.py
  15. 157 0
      db_migration/ParseMapping.py
  16. BIN
      db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc
  17. BIN
      db_migration/__pycache__/MigrationCleaning.cpython-37.pyc
  18. BIN
      db_migration/__pycache__/ParseDbSchema.cpython-37.pyc
  19. BIN
      db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc
  20. BIN
      db_migration/__pycache__/ParseMapping.cpython-37.pyc
  21. 798 0
      hyperopt/HyperoptPipelineSelection.py
  22. 130 0
      import_process_instances/CleanProcessTable.py
  23. 87 0
      import_process_instances/CleanRs0.py
  24. 170 0
      import_process_instances/CleanRs1.py
  25. 82 0
      import_process_instances/CleanRs2.py
  26. 58 0
      import_process_instances/CleanRs70.py
  27. 149 0
      import_process_instances/MergeProcessTables.py
  28. BIN
      import_process_instances/__pycache__/CleanProcessTable.cpython-37.pyc
  29. BIN
      import_process_instances/__pycache__/CleanRs0.cpython-37.pyc
  30. BIN
      import_process_instances/__pycache__/CleanRs1.cpython-37.pyc
  31. BIN
      import_process_instances/__pycache__/CleanRs2.cpython-37.pyc
  32. BIN
      import_process_instances/__pycache__/CleanRs70.cpython-37.pyc
  33. BIN
      import_process_instances/__pycache__/MergeProcessTables.cpython-37.pyc
  34. BIN
      import_process_instances/__pycache__/parallelized_import.cpython-37.pyc
  35. 74 0
      import_process_instances/parallelized_import.py
  36. 58 0
      log.py
  37. 73 0
      utils/ClassLogging.py
  38. 62 0
      utils/CleaningUtils.py
  39. BIN
      utils/__pycache__/ClassLogging.cpython-37.pyc
  40. BIN
      utils/__pycache__/CleaningUtils.cpython-37.pyc

BIN
cdplib/__pycache__/__init__.cpython-37.pyc


+ 270 - 0
cdplib/feature_engineering/StatisticalFeatures.py

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" 
+Created on Tue Oct 16 16:08:47 2018
+
+@author: tanya
+"""
+import types
+import logging
+import pandas as pd
+
+from collections import defaultdict
+from functools import reduce
+
+from libraries.logging.logging_utils import configure_logging
+from libraries.exception_handling import InputChecks
+          
+class StatisticalFeatures:
+    '''
+    Groups data by index columns and returns aggregated statistics for given columns.
+
+    :param pandas.DataFrame data: data to aggregate
+    :param list index_cols: columns to group by
+    :param str path_to_log: optional path to a log file
+
+    Aggregations are passed to the methods either as a list of tuples of the form
+    [(colname_1, [aggfunc_1, aggfunc_2]), (colname_2, aggfunc_3)]
+    or as a dictionary of the form {colname_1: [aggfunc_1, aggfunc_2], colname_2: aggfunc_3},
+    where colname_i is a column to aggregate and aggfunc_i is either
+    a function variable or a string accepted by pandas as a built-in function name.
+    NOTE: using strings for built-in functions speeds up the calculations by a factor >= 20.
+    WARNING: if multiple aggfuncs with the same name are given for a column (like 'sum' and np.sum),
+    only the first one is kept.
+    WARNING: nan values are ignored by numpy and pandas built-in aggregation functions.
+    '''
+    def __init__(self, data, index_cols, path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+            
+        self.logger = logging.getLogger(__name__)
+        
+        self.checks = InputChecks(logger = self.logger)
+        
+        self.data = data
+        
+        self.checks.assert_correct_type({'data': [pd.DataFrame]})
+            
+        self.index_cols = index_cols
+        
+        # make warning about missing values in index columns
+        for col in self.index_cols:
+            if data[col].isnull().any():
+                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
+
+        
+    def get_kpis_by_aggregation(self, kpis):
+        '''
+        Aggregates given fields with given aggregation functions
+         USE CASE: per product find the mean and standard deviation of the price
+        
+        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
+         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
+         where the aggfuncs are reducing functions, given either as function objects or as strings naming pandas built-in functions
+         
+        :return: features with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        def get_valid_agg_dict_from_kpis(kpis):
+            '''
+            Filters inputs of incorrect shape or type,
+            Filters out columns not present in data
+            Removes multiple functions with the same name
+            Makes a quick check that the aggregation with the given fields and functions does not fail on the first 2 lines
+            Reports to the log
+            :param list or dict kpis:
+            '''
+            def get_name(x):
+                '''
+                Returns function name for function and does nothing for string
+                '''
+                if isinstance(x, types.FunctionType):
+                    return x.__name__
+                else:
+                    return x
+                
+            def passed_first_line_type_control(col, aggfunc):
+                '''
+                Checks if aggregation works on the first 2 lines of the data
+                '''
+                try:
+                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+                    self.data.iloc[:2]\
+                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                             .groupby(self.index_cols)\
+                             .agg({col : aggfunc})
+                    return True
+                except Exception as e:
+                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error: ' + str(e))
+                    return False
+           
+            
+            
+            valid_kpi_dict = defaultdict(list)
+            
+            if isinstance(kpis, list):
+                incorrect_kpis = [kpi for kpi in kpis if len(kpi) != 2]
+                if len(incorrect_kpis) > 0:
+                    self.logger.warning('Inputs ' + str(incorrect_kpis) + ' do not have the correct length.')
+                
+                cols = list(zip(*kpis))[0]             
+                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
+            elif isinstance(kpis, dict):
+                cols = list(kpis.keys())
+                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
+                
+            cols_not_in_data = set(cols) - set(self.data.columns)
+            if len(cols_not_in_data) > 0:
+                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
+                
+            for col, aggfuncs in kpis:
+                if not isinstance(aggfuncs, list):
+                    aggfuncs = [aggfuncs]
+                
+                for aggfunc in aggfuncs:
+                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
+                    if not is_new_funcname:
+                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
+                    
+                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
+                        valid_kpi_dict[col].append(aggfunc)
+                    
+            return valid_kpi_dict
+                   
+        
+        
+        
+        agg_dict = get_valid_agg_dict_from_kpis(kpis)
+        
+        if len(agg_dict) > 0:
+        
+            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
+                             else '_'.join([col, str(aggfunc)]) 
+                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
+            
+            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                       .groupby(self.index_cols)\
+                       .agg(agg_dict)\
+                       .set_axis(new_names, axis = 'columns', inplace = False)\
+                       .reset_index()
+        else:
+            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+        
+        
+        
+        
+        
+        
+        
+    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
+        '''
+        A wrapper around the pandas crosstab method with the index equal to index_cols
+        USE CASE: per product find the standard deviation of the price in each city
+        
+        :param str pivot_col: column whose values become columns in the output
+        :param str value_col: column name used to fill in the values
+        :param str or func aggfunc: count if None
+        :param list entries: values of pivot_col to show
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        
+        # assert that types of the inputs are correct
+        types_to_check = {'pivot_col' : [str], 
+                          'value_col' : [str, type(None)],  
+                          'aggfunc' : [str, types.FunctionType, type(None)], 
+                          'entries' : [list, type(None)]}
+        
+        self.checks.assert_correct_type(types_to_check)
+        
+        cols_to_check = [pivot_col]
+        if not value_col is None:
+            cols_to_check.append(value_col)
+        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
+
+        if not entries is None:
+            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
+        else:
+            entry_filter = pd.Series([True]*len(self.data))              
+    
+        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
+        columns = self.data.loc[entry_filter, pivot_col]
+        if not value_col is None:
+            values = self.data.loc[entry_filter, value_col]
+        else:
+            values = None
+                        
+        result = pd.crosstab(index = index, columns = columns, values = values, aggfunc = aggfunc)
+        result = result.rename(columns = {c : str(value_col) + '_' + str(c) for c in result.columns})\
+                       .reset_index()
+        return result
+    
+
+
+
+
+        
+    
+    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
+        '''
+        Aggregates values obtained with method get_value_stats
+         USE CASE: per product find average variation of the price over all cities
+         
+        :param str pivot_col:
+        :param str value_col:
+        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
+        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
+        :param list entries: 
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
+        
+        value_stat_kpis = self.get_value_stats(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
+
+        result = value_stat_kpis[self.index_cols].copy(deep = True)
+        
+        for aggfunc in aggfuncs_step2:
+            colname = '_'.join([str(aggfunc), str(aggfunc_step1), str(value_col), str(pivot_col)])
+            
+            if isinstance(aggfunc, str):
+                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
+            else:
+                result[colname] = value_stat_kpis.set_index(self.index_cols)\
+                                                 .apply(aggfunc, axis = 1)\
+                                                 .reset_index(drop = True)
+                                                 
+        return result
+                              
+                              
+                              
+                              
+                                                            
+    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
+        '''
+        Finds argmin or argmax of a column
+         USE CASE: per product find the city with maximum variation of the price
+        
+        :param str min_or_max: must be in ['min', 'max']
+        :param str pivot_col:
+        :param str value_col:
+        :param str aggfunc:    
+        '''
+        self.checks.assert_valid_value(argname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
+        
+        if min_or_max == 'max':
+            aggfuncs_step2 = ['idxmax']
+        else:
+            aggfuncs_step2 = ['idxmin']
+            
+        return self.get_aggregated_value_stats(pivot_col = pivot_col, 
+                                               value_col = value_col, 
+                                               aggfunc_step1 = aggfunc, 
+                                               aggfuncs_step2 = aggfuncs_step2)
+        
+        
+        
+        
+    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
+    
+    # can also add pick detection from the other project and calculate the number of picks. Probably first create TimeSeriesManipulation class.
+    
+    # write tests for all methods

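A brief usage sketch (editorial addition, not part of the commit) of the two kpi formats accepted by get_kpis_by_aggregation and of the get_value_stats crosstab wrapper. The DataFrame and column names are hypothetical, and the libraries.* dependencies (configure_logging, InputChecks) are assumed to be importable so the class can be instantiated.

import pandas as pd

# hypothetical data: one price observation per product and city
data = pd.DataFrame({'product': ['a', 'a', 'b', 'b'],
                     'city':    ['x', 'y', 'x', 'y'],
                     'price':   [1.0, 2.0, 3.0, 5.0]})

features = StatisticalFeatures(data=data, index_cols=['product'])

# list-of-tuples form: per product, mean and standard deviation of the price
kpis_as_list = [('price', ['mean', 'std'])]
# equivalent dictionary form
kpis_as_dict = {'price': ['mean', 'std']}

print(features.get_kpis_by_aggregation(kpis_as_list))
# expected columns: product, price_mean, price_std

# crosstab wrapper: per product, mean price in each city
print(features.get_value_stats(pivot_col='city', value_col='price', aggfunc='mean'))
# expected columns: product, price_x, price_y
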
+ 77 - 0
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 15:11:21 2018
+
+@author: tanya
+"""
+
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
+
+
+class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
+    '''
+    '''
+    
+    def __init__(self, data, index_cols, date_col, split_date, period_length, past_or_future = 'past', freq = 'days', n_periods = 1, path_to_log = None):
+        '''
+        '''
+        super(StatisticalFeaturesAveragedOverTimePeriods, self).__init__(data = data.copy(deep = True),
+                                                                         index_cols = index_cols,
+                                                                         date_col = date_col,
+                                                                         split_date = split_date,
+                                                                         period_length = n_periods*period_length,
+                                                                         past_or_future = past_or_future,
+                                                                         freq = freq,
+                                                                         path_to_log = path_to_log)
+        
+        self.period_number_col = 'period_number'
+        while self.period_number_col in data.columns:
+            self.period_number_col += '&'
+        
+        period_numbers = self.data[self.index_cols + [date_col]].drop_duplicates()\
+                             .groupby(index_cols)[date_col].cumcount()\
+                             .reset_index()\
+                             .assign(period_number = lambda x: x[0]/period_length)\
+                             .rename(columns = {'period_number' : self.period_number_col})
+                                       
+                
+        self.data = pd.merge(self.data, period_numbers, how = 'left', on = self.index_cols)
+                            
+        self.initial_index_cols = self.index_cols.copy()
+        self.index_cols.append(self.period_number_col)
+        
+        
+    def _aggregate_over_time_periods(self, df):
+        '''
+        '''
+        return df.drop(self.period_number_col, axis = 1)\
+                 .groupby(self.initial_index_cols)\
+                 .mean()\
+                 .reset_index()
+        
+        
+    def get_kpis_by_aggregation(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
+                                                      .get_kpis_by_aggregation(**args))
+            
+            
+    def get_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
+                                                 .get_value_stats(**args))
+        
+        
+    def get_aggregated_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
+                                                 .get_aggregated_value_stats(**args))
+        
+    
+        

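A minimal sketch (editorial addition) of how this class might be instantiated, assuming the same hypothetical product/price data as in the previous sketch plus a 'date' column, and assuming the libraries.* dependencies resolve. Statistics are computed per 7-day period within the 4*7-day window before split_date and then averaged over the periods.

import pandas as pd

features = StatisticalFeaturesAveragedOverTimePeriods(data=data,
                                                      index_cols=['product'],
                                                      date_col='date',
                                                      split_date=pd.Timestamp('2018-11-01'),
                                                      period_length=7,
                                                      n_periods=4,
                                                      past_or_future='past',
                                                      freq='days')

# per product: mean price within each 7-day period, averaged over the periods
print(features.get_kpis_by_aggregation(kpis={'price': ['mean']}))
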
+ 53 - 0
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 14:02:18 2018
+
+@author: tanya
+"""
+
+import logging
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
+from libraries.exception_handling import InputChecks, InputCasts
+from libraries.logging.logging_utils import configure_logging
+
+class StatisticalFeaturesOverTime(StatisticalFeatures):
+    '''
+    '''
+    def __init__(self, data, index_cols, date_col, split_date, period_length = None, past_or_future = 'past', freq = 'days', path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+        self.logger = logging.getLogger(__name__)
+        self.checks = InputChecks(logger = self.logger)
+        self.casts = InputCasts(logger = self.logger)
+        
+        self.checks.assert_column_presence(data = data, colnames = [date_col])
+        self.checks.assert_valid_value(argname = 'past_or_future', val = past_or_future, valid_values = ['past', 'future'])
+        self.checks.assert_valid_value(argname = 'freq', val = freq, valid_values = ['seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years'])
+        
+        
+        split_date = self.casts.cast_arg_to_pandas_datetime(argname = 'split_date', val = split_date)
+        data[date_col] = self.casts.cast_column_to_pandas_datetime(series = data[date_col], colname = date_col, all_or_any = 'all')
+        
+        if past_or_future == 'past':
+            if not period_length is None:
+                min_date = split_date - pd.DateOffset(**{freq : period_length})
+            else:
+                min_date = data[date_col].min()
+            sup_date = split_date
+        else:
+            min_date = split_date
+            if not period_length is None:
+                sup_date = split_date + pd.DateOffset(**{freq : period_length})
+            else: 
+                sup_date = split_date + pd.DateOffset(**{freq : 1})
+        
+        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)
+        
+        super(StatisticalFeaturesOverTime, self).__init__(data = data.loc[time_mask].reset_index(drop = True).copy(deep = True),
+                                                          index_cols = index_cols,
+                                                          path_to_log = path_to_log)

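The window construction in __init__ can be illustrated in isolation (editorial sketch, plain pandas only): for past_or_future='past', freq='days' and period_length=30, only rows dated within the 30 days before split_date survive the time mask.

import pandas as pd

split_date = pd.Timestamp('2018-11-07')
min_date = split_date - pd.DateOffset(days=30)   # lower bound, inclusive
sup_date = split_date                            # upper bound, exclusive

dates = pd.Series(pd.to_datetime(['2018-10-01', '2018-10-20', '2018-11-07']))
time_mask = (dates >= min_date) & (dates < sup_date)
print(time_mask.tolist())  # [False, True, False]
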
+ 798 - 0
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -0,0 +1,798 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov  9 13:27:44 2018
+
+@author: tanja
+@description: Implementation of machine learning
+                pipeline selection and tuning with hyperopt library
+"""
+
+import os
+import sys
+import gc
+import logging
+import pickle
+import time
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
+    space_eval, pyll
+
+from sklearn.model_selection import cross_validate
+from sklearn.metrics import make_scorer
+
+
+class HyperoptPipelineSelection:
+    '''
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    being developed by the hyperopt creators.)
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    '''
+    def __init__(self,
+                 cost_func,
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = 1,
+                 log_path: str = None,
+                 averaging_func: callable = None):
+        '''
+        :param callable cost_func: function to minimize or maximize
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from
+            the beginning.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+
+        :param str log_path: Optional, when not provided logs to stdout.
+
+        :param callable averaging_func: optional,
+            when not provided set to mean. Function
+            to aggregate the cross-validated values of the cost function.
+            The classic choice is to take the mean;
+            another example is mean() - c*var().
+        '''
+
+        assert(callable(cost_func)),\
+            "Parameter 'cost_func' must be a callable"
+
+        assert(isinstance(greater_is_better, bool)),\
+            "Parameter 'greater_is_better' must be bool type"
+
+        assert(isinstance(trials_path, str)),\
+            "Parameter 'trials_path' must be of string type"
+
+        if averaging_func is not None:
+            assert(callable(averaging_func)),\
+                "Parameter 'averaging_func' must be a callable"
+
+        self._assert_valid_directory(path=trials_path)
+
+        self._configure_logger(log_path)
+
+        self._cost_func = cost_func
+        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        self._score_factor = (not greater_is_better) - greater_is_better
+        self._trials_path = trials_path
+        # is initialized with empty trials object
+        self._trials = Trials()
+        self._backup_trials_freq = backup_trials_freq
+        self._averaging_func = averaging_func or np.mean
+        # keeping track of the current search iteration
+        self._run_number = 0
+        # space and data need to be attached to perform search.
+        self._space_attached = False
+        self._data_attached = False
+
+        # if a trials object already exists at the given path,
+        # it is loaded and the search is continued. Else,
+        # the search is started from the beginning.
+        if os.path.isfile(trials_path):
+            try:
+                with open(trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(len(self._trials.trials)))
+
+            except Exception as e:
+                self._logger.error(("Trials object could not be loaded. "
+                                    "Training starts from the beginning. "
+                                    "Exit with error {}").format(e))
+
+        else:
+            self._logger.info(("No existing trials object was found"
+                               "Initialized an empty trials object."))
+
+        self._best_score = self.best_trial_score
+
+    def _configure_logger(self, log_path: str = None):
+        '''
+        Can be replaced with the existing script later.
+        When log_path is not provided, logs to stdout.
+        '''
+
+        self._logger = logging.getLogger(__name__)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_path is not None:
+            assert(isinstance(log_path, str)),\
+                "Parameter 'log_path' must be of string type"
+            self._assert_valid_directory(log_path)
+
+            handler = logging.FileHandler(log_path)
+        else:
+            handler = logging.StreamHandler(sys.stdout)
+
+        formatter = logging.Formatter(
+                '\n %(asctime)s %(levelname)s %(message)s')
+
+        handler.setFormatter(formatter)
+        self._logger.addHandler(handler)
+        self._logger.setLevel("INFO")
+
+    def _backup_trials(self):
+        '''
+        Pickles (Saves) the trials object.
+        Used in a scheduler.
+        '''
+        with open(self._trials_path, "wb") as f:
+            pickle.dump(self._trials, f)
+
+    def _assert_valid_directory(self, path: str):
+        '''
+        If the directory of a path does not exist yet,
+        creates it.
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must of str type"
+
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            os.makedirs(dirname, exist_ok=True)
+
+    def attach_space(self, space: pyll.base.Apply = None,
+                     module_path: str = None,
+                     name: str = None):
+        '''
+        :param pyll.base.Apply space: hyperopt space where
+            the search is performed. Optional when a space
+            is loaded from a python module.
+
+        :param str module_path: path to python module
+            where the space is defined. Optional when
+            the space is provided directly.
+
+        :param str name: name of the space loaded from
+            a python module. Optional when the space
+            is provided directly.
+        '''
+        assert((space is not None) or
+               ((module_path is not None) and (name is not None))),\
+            "Either space or (module_path, name) must be provided"
+
+        if space is None:
+            for pname, pval in [("module_path", module_path), ("name", name)]:
+                assert(isinstance(pval, str)),\
+                    "Parameter '{}' must be of str type".format(pname)
+
+            assert(os.path.isfile(module_path)),\
+                "Parameter 'module_path' must be a valid file"
+
+            module, extension = os.path.splitext(os.path.basename(module_path))
+            assert(extension == ".py"),\
+                "Parameter 'space' must be read from a python file"
+
+            sys.path.insert(0, os.path.dirname(module_path))
+
+            try:
+                space = getattr(__import__(module), name)
+            except (ImportError, AttributeError):
+                err = "Invalid space location or name"
+                self._logger.error(err)
+                raise Exception(err)
+
+        assert(isinstance(space, pyll.base.Apply)),\
+            "Parameter 'space' must be of hyperopt space type"
+
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self._space_attached = True
+
+    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
+            -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            e = 'The argument must be a numpy array or a pandas DataFrame'
+            self._logger.critical(e)
+            raise ValueError(e)
+
+    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
+                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    X_val: (pd.DataFrame, np.ndarray) = None,
+                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    cv: (list, int) = None):
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (not all algorithms require targets)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: list of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+
+        X_train = self._convert_to_array(X_train)
+        if y_train is not None:
+            y_train = self._convert_to_array(y_train)
+
+        if X_val is not None:
+            if cv is not None:
+                self._logger.warning(("Both validation set and cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on the validation set!"))
+
+            X_val = self._convert_to_array(X_val)
+
+            train_inds = list(range(len(X_train)))
+            val_inds = list(range(len(X_train),
+                                  len(X_train) + len(X_val)))
+
+            # cost is evaluated with a cross validation function
+            # that accepts an array and a cv object with
+            # indices of the fold splits.
+            # Here we create a trivial cv object
+            # with one validation split.
+            self._cv = [(train_inds, val_inds)]
+            self._X = np.concatenate([X_train, X_val])
+
+            if y_train is not None:
+                if y_val is None:
+                    err = "Argument y_val must be provided"
+                    self._logger.critical(err)
+                    raise ValueError(err)
+                else:
+                    y_val = self._convert_to_array(y_val)
+                    self._y = np.concatenate([y_train, y_val])
+            else:
+                self._y = None
+        else:
+            if cv is None:
+                self._logger.warning(("Neither validation set nor cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on 5 randomly "
+                                      "splitted folds."))
+
+            self._X = X_train
+            self._y = y_train
+            self._cv = cv
+
+        self._logger.info("Attached data")
+        self._data_attached = True
+
+    def _evaluate(self, pipeline: Pipeline) -> dict:
+        '''
+        This method is called in _objective.
+
+        Calculates the cost on the attached data.
+        This function can be overriden, when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :output: dictionary with the aggregated
+            cross-validation score and
+            the score variance.
+        '''
+
+        scores = cross_validate(estimator=pipeline,
+                                X=self._X,
+                                y=self._y,
+                                cv=self._cv or 5,
+                                scoring=make_scorer(self._cost_func),
+                                error_score=np.nan)
+
+        return {'value': self._averaging_func(scores['test_score']),
+                'variance': np.var(scores['test_score'])}
+
+    def _objective(self, space_element: dict) -> dict:
+        '''
+        This method is called in search_for_best_pipeline
+        inside the hyperopt fmin method.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it should be multiplied by -1 to obtain loss.
+
+        :param dict space_element: must contain keys
+            name (with the name of the pipeline),
+            pipeline (Pipeline object),
+            params (dict of pipeline params)
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+        '''
+        assert(isinstance(space_element, dict) and
+               set(['name', 'pipeline', 'params']) <= space_element.keys())
+
+        assert(isinstance(space_element['name'], str) and
+               isinstance(space_element['pipeline'], Pipeline) and
+               isinstance(space_element['params'], dict))
+
+        start_time = time.time()
+
+        if not self._data_attached:
+            raise Exception(("Data must be attached in order "
+                             "in order to effectuate the best"
+                             "pipeline search"))
+
+        self._run_number += 1
+
+        pipeline = space_element['pipeline']
+        params = space_element['params']
+        pipeline.set_params(**params)
+
+        self._logger.info(("Run number {0}: "
+                           "Current score is {1}: "
+                           "Training pipeline {2} "
+                           "with parameters: {3}. ").format(
+                             self._run_number,
+                             self._best_score,
+                             space_element['name'],
+                             params))
+
+        try:
+            score_stats = self._evaluate(pipeline)
+            assert(not np.isnan(score_stats["value"])),\
+                "Returned null score"
+
+            if self._run_number % self._backup_trials_freq == 0:
+                self._backup_trials()
+
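+            # np.nan != np.nan, so the first condition detects that no best score exists yet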
+            if (self._best_score != self._best_score) or\
+                self._score_factor*score_stats["value"] <\
+                    self._score_factor*self._best_score:
+
+                self._logger.info("Score got better, new best score is: {}"
+                                  .format(score_stats["value"]))
+
+                self._best_score = score_stats['value']
+
+                self._backup_trials()
+
+            end_time = time.time()
+
+            return {'loss': self._score_factor * score_stats["value"],
+                    'status': STATUS_OK,
+                    'score': score_stats["value"],
+                    'score_variance': score_stats["variance"],
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': end_time - start_time}
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            return {'loss': np.nan,
+                    'status': STATUS_FAIL,
+                    'score': np.nan,
+                    'score_variance': np.nan,
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': np.nan}
+
+    def search_for_best_pipeline(self,
+                                 niter: int,
+                                 algo: callable = tpe.suggest):
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param callable algo: now can only take values tpe for a tree-based
+            random search or random for random search
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        assert(isinstance(niter, int)),\
+            "Parameter 'niter' must be of int type"
+
+        # right now only two algorithms are provided by hyperopt
+        assert(algo in [tpe.suggest, rand.suggest]),\
+            ("Parameter 'algo' can currently only be tpe or random. "
+             "If other algorithms have been developed by "
+             "hyperopt, please add them to the list.")
+
+        try:
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            best = fmin(fn=self._objective,
+                        space=self._space,
+                        algo=algo,
+                        trials=self._trials,
+                        max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best)))
+
+            self._backup_trials()
+
+        except Exception as e:
+            raise ValueError(("Failed to select best "
+                             "pipeline! Exit with error: {}").format(e))
+
+    @property
+    def best_trial_score(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_score_variance(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score_variance"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_pipeline(self) -> Pipeline:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > 0:
+
+            return space_eval(
+                    self._space,
+                    {k: v[0] for k, v in
+                     self._trials.best_trial['misc']['vals'].items()
+                     if len(v) > 0})["pipeline"]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def _ith_trial_loss(self, i: int) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > i:
+            return self._trials.trials[i]['result']['loss']
+        else:
+            return np.nan
+
+    def _ith_trial_element(self, i: int, name: str) -> object:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > i:
+            return space_eval(self._space,
+                              {k: v[0] for k, v in
+                               self._trials.trials[i]['misc']['vals']
+                               .items() if len(v) > 0})[name]
+
+    def _ith_trial_pipeline(self, i: int) -> Pipeline:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='pipeline')
+
+    def _ith_trial_name(self, i: int) -> str:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='name')
+
+    def _ith_trial_params(self, i: int) -> dict:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='params')
+
+    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
+        '''
+        '''
+        if len(self._trials.trials) > i:
+            return self._trials.trials[i]["result"]["timestamp"]
+
+    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
+        '''
+        Returns the list of n best pipelines
+        documented in trials
+        '''
+        if len(self._trials.trials) > 0:
+            if losses is None:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))]
+
+            best_n_indices = [losses.index(l)
+                              for l in sorted(list(set(losses)))[:n]]
+
+            return [self._ith_trial_pipeline(i) for i in best_n_indices]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
+        '''
+        Returns a dictionary where keys are pipeline names,
+        and values are lists of best pipelines with this name
+        '''
+        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
+
+        if len(self._trials.trials) > 0:
+
+            best_pipelines_per_type = {}
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+
+            for nm in names:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))
+                          if self._ith_trial_name(i) == nm]
+
+                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
+                                                        n=n,
+                                                        losses=losses)
+
+            return best_pipelines_per_type
+
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def write_trials_documentation(self, path: str = None):
+        '''
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        '''
+        path = path or "hyperopt_trials_documentation.xlsx"
+
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must be of string type"
+
+        self._assert_valid_directory(path)
+
+        if len(self._trials.trials) > 0:
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+            scores = [self._score_factor*self._ith_trial_loss(i)
+                      for i in range(len(self._trials.trials))]
+            params = [self._ith_trial_params(i)
+                      for i in range(len(self._trials.trials))]
+            timestamps = [self._ith_trial_timestamp(i)
+                          for i in range(len(self._trials.trials))]
+
+        else:
+            names = []
+            scores = []
+            params = []
+            timestamps = []
+
+        pd.DataFrame({"name": names,
+                      "score": scores,
+                      "params": params,
+                      "timestamp": timestamps})\
+          .to_excel(path)
+
+
+if __name__ == '__main__':
+
+    from sklearn.metrics import roc_auc_score, make_scorer
+    from xgboost import XGBClassifier
+    from sklearn.svm import SVC
+    from sklearn.feature_selection import SelectKBest
+    from sklearn.decomposition import PCA
+    from sklearn.datasets import load_iris
+    from pprint import pprint
+
+    data = load_iris()
+    X = pd.DataFrame(data.data)
+    y = pd.Series(data.target)
+    # produce a binary variable
+    y = (y == 2).astype(int)
+    del data
+    gc.collect()
+
+    # SPACE DEFINITION ########################################
+    # (can be moved to a separate python script)
+
+    """
+    A search space must be a list of dictionaries.
+    Each dictionary must have keys:
+        name (pipeline name or type),
+        pipeline (instance of sklearn.pipeline.Pipeline),
+        params (dictionary of distributions for the parameters of
+                the pipeline that we want to tune)
+
+    Here we have a space that consists of two dictionaries:
+    KBEST_XGBOOST and PCA_SVC
+    """
+    space = []
+
+    pipeline_dist_1 = {}
+    pipeline_dist_1["name"] = "KBEST_XGBOOST"
+
+    """
+    A pipeline consists of steps (tuples).
+    Each step has a name and an algorithm.
+    This pipeline, as a first step performs
+    feature selection with SelectKBest and
+    as a second step evaluates a machine learning algo (xgboost).
+
+    Like all sklearn algorithms, a Pipeline has methods
+    fit, predict, set_params, get_params
+    """
+    pipeline_dist_1["pipeline"] = Pipeline([
+                                     ('kbest', SelectKBest()),
+                                     ('xgb', XGBClassifier())
+                                     ])
+    """
+    Pipeline parameter dictionaries must be of the form:
+    {'kbest__k': 3, xgb__n_estimators: 20},
+    each parameter name consists of the step name, __, and parameter name.
+
+    Here, instead of values, the parameter names are followed
+    by hyperopt distributions.
+    Each hyperopt distribution also must have a name,
+    due to hyperopt functionality.
+
+    Here, we set the hyperopt distribution name to the step name,
+    but it does not have to be so. Hyperopt distribution names
+    must be different for different elements of the space.
+    """
+
+    pipeline_dist_1["params"] = {
+            'kbest__k': hp.choice('kbest__k', range(1, 5)),
+
+            'xgb__n_estimators':
+            50 + hp.randint('xgb__n_estimators', 50),
+
+            "xgb__learning_rate":
+            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
+            }
+
+    space.append(pipeline_dist_1)
+
+    pipeline_dist_2 = {}
+    pipeline_dist_2["name"] = "PCA_SVC"
+
+    pipeline_dist_2["pipeline"] = Pipeline([
+                                     ('pca', PCA()),
+                                     ('svc', SVC(gamma="scale"))
+                                     ])
+
+    pipeline_dist_2["params"] = {
+            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
+
+            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
+            }
+
+    space.append(pipeline_dist_2)
+
+    space = hp.choice('pipelines', space)
+
+    # TESTING ##########################################################
+
+    trials_path = 'TEST_hyperopt_trials.pkl'
+
+    doc_path = 'TEST_hyperopt_doc.xlsx'
+
+    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
+                                       greater_is_better=True,
+                                       trials_path=trials_path)
+
+    hp_obj.attach_data(X_train=X, y_train=y)
+
+    hp_obj.attach_space(space=space)
+
+    hp_obj.search_for_best_pipeline(niter=10)
+
+    print('\n', '='*20, 'TESTING', '='*20)
+
+    print('\n', 'Best score:', hp_obj.best_trial_score)
+
+    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
+
+    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
+
+    print('\n', 'Best 3 pipelines: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
+
+    print('\n', 'Best pipeline per type: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
+
+    hp_obj.write_trials_documentation(path=doc_path)
+
+    # os.remove(doc_path)
+    # os.remove(trials_path)

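Because the trials object is pickled to trials_path, a search can be resumed later. A sketch (editorial addition) reusing the X, y, space and trials_path objects from the test block above:

# A new selector constructed with the same trials_path loads the saved trials and continues.
hp_obj_resumed = HyperoptPipelineSelection(cost_func=roc_auc_score,
                                           greater_is_better=True,
                                           trials_path=trials_path)
hp_obj_resumed.attach_data(X_train=X, y_train=y)
hp_obj_resumed.attach_space(space=space)
hp_obj_resumed.search_for_best_pipeline(niter=5)  # 5 more trials on top of the previous ones
print(hp_obj_resumed.best_trial_score)
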
+ 211 - 0
db_handlers/MongodbHandler.py

@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Created on Mon Sep 16 13:27:44 2019
+
+@author: oskar
+@description: Implementation of a database handler that abstracts access to MongoDB.
+"""
+
+
+import json
+import simplejson
+import sys
+import os
+import jsonref
+
+from copy import deepcopy
+from pymongo import MongoClient
+import pandas as pd
+import numpy as np
+
+sys.path.append(os.getcwd())
+from libraries.log import Log
+from libraries.configuration import default as cfg
+
+class MongodbHandler:
+
+    '''
+
+    '''
+
+    def __init__(self, database_url: str = cfg['MONGO_DB']['URI'],
+                 database_name: str = cfg['MONGO_DB']['DATABASE_NAME']):
+        '''
+        :param str database_url: Url for the mongodb database
+        :param str database_name: Name of the database the database handler should handle
+        '''
+        assert(isinstance(database_url, str)),\
+            "Parameter 'database_url' must be a string type"
+        assert(isinstance(database_name, str)),\
+            "Parameter 'database_name' must be a string type"
+
+        self._log = Log("\nMongodbHandler script")
+
+        self._log.info('Mongodb Handler has been initialized')
+        # Connect to the MongoDB
+        self._client = MongoClient(database_url)
+        # Connect to the given database, or create it if it doesn't exist.
+        self._database = self._client[database_name]
+
+    def _read_schema(self, schema_path: str) -> dict:
+        '''
+        :param str schema_path: path to the schema file.
+        '''
+
+        assert(isinstance(schema_path, str)),\
+            "Parameter 'schema_path must be a string type"
+
+        with open(schema_path) as json_file:
+            schema = json.load(json_file)
+
+        if 'definitions' in schema:
+            schema = self._dereference_schema(schema)
+
+        return schema
+
+    def _dereference_schema(self, schema: dict) -> dict:
+        '''
+        :param dict schema: dictionary containing a schema which uses references.
+        '''
+
+        assert(isinstance(schema, dict)),\
+            "Parameter 'schema' must be a dictionary type"
+
+        schema = jsonref.loads(str(schema).replace("'", "\""))
+        schema = deepcopy(schema)
+        schema.pop('definitions', None)
+        return schema
+
+    def set_collection_schema(self, collection_name: str, schema_path: str,
+                              validation_level: str = 'moderate',validation_action: str = 'error'):
+        '''
+        :param str collection_name: name of the collection for which the schema will be set.
+        :param str schema_path: path to the schema file.
+        :param str validation_level: level of validation done by the mongodb.
+        :param str validation_action: action taken on validation failure: 'warn' or 'error'.
+        '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
+        assert(isinstance(schema_path, str)),\
+            "Parameter 'schema_path' must be a string type"
+        assert(isinstance(validation_level, str)),\
+            "Parameter 'validation_lever' must be a string type"
+        assert(isinstance(validation_action, str)),\
+            "Parameter 'validation_action' must be a string type"
+
+        schema = self._read_schema(schema_path)
+
+        command = {
+                    'collMod': collection_name,
+                    'validator': {
+                        '$jsonSchema': schema
+                    },
+                    'validationLevel': validation_level,
+                    'validationAction': validation_action
+                    }
+
+        self._database.command(command)
+
+    def create_collection(self, collection_name):
+        '''
+        :param str collection_name: name of the collection to be created.
+        '''
+
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
+
+        if collection_name not in self._database.list_collection_names():
+            self._log.info(("Collection '{}' has been created").format(collection_name))
+            return self._database.create_collection(collection_name)
+        else:
+            self._log.info(("Collection '{}' already exists").format(collection_name))
+            return self._database[collection_name]
+
+    def insert_data_into_collection(self, data: (dict, list, np.ndarray, pd.DataFrame, pd.Series),
+                                    collection_name: str,
+                                    ordered: bool = False):
+        '''
+        :param data: data to be inserted into the collection (dict, list, numpy array, DataFrame or Series)
+        :param str collection_name: name of the collection the data will be added to.
+        :param bool ordered: whether insert_many should insert the documents in order.
+        '''
+
+        allowed_types = (dict, list, np.ndarray, pd.DataFrame, pd.Series)
+
+        assert(isinstance(data, allowed_types)),\
+            "Parameter 'data' is of invalid type"
+
+        if isinstance(data, np.ndarray):
+            data = pd.DataFrame(data)
+
+        if isinstance(data, pd.DataFrame):
+
+            data = simplejson.loads(data.to_json(orient="records",
+                                                 date_format="iso"))
+
+        elif isinstance(data, pd.Series):
+
+            data = simplejson.loads(data.to_json(date_format="iso"))
+
+        if (len(data) == 1) or (isinstance(data, dict)):
+
+            if isinstance(data, list):
+                data = data[0]
+
+            self._database[collection_name].insert_one(data)
+        else:
+            self._database[collection_name].insert_many(data, ordered=ordered)
+
+        self._log.info(('Data has been inserted into the {} collection').format(collection_name))
+
+    def create_collection_and_set_schema(self, collection_name: str, schema_path: str):
+        '''
+        :param str collection_name: name of the collection to be created.
+        :param str schema_path: path to the schema file.
+        '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
+        assert(isinstance(schema_path, str)),\
+            "Parameter 'schema_path' must be a string type"
+
+        self.create_collection(collection_name)
+        self.set_collection_schema(collection_name=collection_name, schema_path=schema_path)
+
+    def query_data_and_generate_dataframe(self, collection_name: str, attribute: str = None,
+                                          attribute_value: str = None, comparison_operator: str = '$eq'):
+        '''
+
+        '''
+        if attribute is None or attribute_value is None:
+            data = self._database[collection_name].find()
+        else:
+            data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}})
+
+        df = pd.DataFrame(list(data))
+        df.set_index('radsatznummer', inplace=True)
+        return df
+
+
+if __name__ == "__main__":
+
+    log = Log("Test MongodbHandler:")
+
+    log.info('Script started')
+
+    db_handler = MongodbHandler()
+
+    # Create a collection for each schema file and attach its validation schema.
+    for schema_path in [
+            os.path.join(".", "mongo_schema", "schema_wheelsets.json"),
+            os.path.join(".", "mongo_schema", "schema_process_instances.json"),
+            os.path.join(".", "mongo_schema", "schema_componets.json")]:
+
+        if os.path.isfile(schema_path):
+
+            # strip the "schema_" prefix and the file extension
+            collection_name = os.path.basename(schema_path).split(".")[0]
+            collection_name = collection_name.replace("schema_", "", 1)
+
+            db_handler.create_collection_and_set_schema(collection_name, schema_path)
+
+    log.info(("Existing databases: {}, Collection in OEBB database {}")\
+             .format(db_handler._client.list_database_names(), db_handler._database.list_collection_names()))

+ 595 - 0
db_handlers/SQLHandler.py

@@ -0,0 +1,595 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Sep 18 16:20:50 2018
+
+@author: tanya
+"""
+
+import os
+import sys
+import re
+import sqlalchemy
+import sqlparse
+import pandas as pd
+import warnings
+
+sys.path.append(os.getcwd())
+
+
+class SQLHandler:
+    '''
+    Provides methods for executing sql queries
+    with different database connectors.
+    Remark: in each method we force a new opening and
+    closing of a database connection,
+    this avoids errors when parallelizing with multiprocessing.
+    '''
+
+    def __init__(self, db_uri: str = None,
+                 is_case_insensitive: bool = False):
+        '''
+        :param str db_uri:
+            of form
+            <sqlalchemy_dialect://user:password@host:port/dbname?charset=utf8&local_infile=1>
+
+         sqlalchemy dialects:
+             for mysql: mysql+pymysql
+             for db2: ibm_db_sa
+        '''
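+        # Example URI (hypothetical credentials and host), matching the
+        # pattern documented above:
+        #   "mysql+pymysql://user:password@localhost:3306/mydb?charset=utf8&local_infile=1"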
+
+        from libraries.log import Log
+        from libraries.configuration import default as cfg
+        from sqlalchemy_utils import database_exists, create_database
+
+        self._log = Log(name='SQLHandler')
+
+        if db_uri is None:
+            db_uri = cfg["SQL_DB"]["URI"]
+
+        assert(isinstance(db_uri, str)),\
+            "Parameter 'db_uri' must be of type str"
+
+        assert(re.match(r'.+://.+:(.+)?@.+:.+/.+', db_uri) is not None),\
+            ('database url does not match the pattern: '
+             'sqlalchemy_dialect://user:password@host:port/dbname')
+
+        self._db_uri = db_uri
+
+        engine = sqlalchemy.create_engine(self._db_uri)
+
+        if not database_exists(engine.url):
+            create_database(engine.url)
+
+        query = "CREATE DATABASE IF NOT EXISTS {}"\
+            .format(self._connection_params["db"])
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            engine.execute(query)
+
+        assert(isinstance(is_case_insensitive, bool)),\
+            "Parameter 'is_case_sensetive' must of type bool"
+
+        if 'ibm' in db_uri and not is_case_insensitive:
+            raise Exception('Ibm db2 is case insensitive')
+
+        self._is_case_insensitive = is_case_insensitive
+
+        self._engine = sqlalchemy.create_engine(self._db_uri)
+
+    @property
+    def _connection_params(self) -> dict:
+        '''
+        :return: connection parameters like user,
+            password, host, port, and database name
+        :rtype: dict
+        '''
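+        # For a (hypothetical) URI
+        #   mysql+pymysql://user:pw@localhost:3306/mydb?charset=utf8
+        # the returned dictionary would be
+        #   {'user': 'user', 'password': 'pw', 'host': 'localhost',
+        #    'port': '3306', 'db': 'mydb'}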
+        try:
+            connection_params = {}
+
+            connection_params['user'], connection_params['password'] =\
+                self._db_uri.split('//')[1]\
+                            .split('@')[0]\
+                            .split(':')
+
+            connection_params['host'], connection_params['port'] =\
+                self._db_uri.split('//')[1]\
+                            .split('@')[1]\
+                            .split('/')[0]\
+                            .split(':')
+
+            connection_params['db'] = self._db_uri.split('/')[-1]\
+                                                  .split('?')[0]
+
+            return connection_params
+
+        except Exception as e:
+            err = ("Could not parse connection parameters."
+                   "Finished with error {}")\
+                   .format(e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def drop_database(self):
+        '''
+        '''
+        database = self._connection_params["db"]
+        self.execute("DROP DATABASE IF EXISTS {}".format(database))
+
+    @property
+    def _db_metadata(self) -> dict:
+        '''
+        Returns sql-dialect specific information, like the name of the
+        information schema and the column names in information_schema.tables
+        and information_schema.columns.
+        For ibm databases, information_schema is set to syscat,
+        else it is set to information_schema
+        If these default values do not exist in the given database,
+        the output of the method is set to None
+
+        :return: dictionary with information_schema, schema_col,
+            table_col, column_col, default_schema
+        '''
+
+        db_metadata = {}
+
+        if 'ibm' in self._db_uri:
+            db_metadata['information_schema'] = 'syscat'
+            db_metadata['schema_col'] = 'tabschema'
+            db_metadata['table_col'] = 'tabname'
+            db_metadata['column_col'] = 'colname'
+            db_metadata['default_schema'] =\
+                self._connection_params['user'].upper()
+        else:
+            db_metadata['information_schema'] = 'information_schema'
+            db_metadata['schema_col'] = 'TABLE_SCHEMA'
+            db_metadata['table_col'] = 'TABLE_NAME'
+            db_metadata['column_col'] = 'COLUMN_NAME'
+            db_metadata['default_schema'] =\
+                self._connection_params['db']
+
+        # check if it worked to create metadata
+        try:
+            query = """SELECT *
+                       FROM {}.tables
+                       LIMIT 1
+                    """.format(db_metadata['information_schema'])
+            self.execute(query)
+
+        except Exception as e:
+            self._log.error(e)
+            db_metadata = None
+
+        return db_metadata
+
+    def execute(self, query):
+        '''
+        Executes an sql query (or several queries separated by semicolons).
+        Remark: queries like CREATE, DROP, SELECT work
+        for the majority of sqlalchemy dialects.
+        Queries like SHOW TABLES, LOAD DATA, and queries using
+        INFORMATION_SCHEMA are mysql specific and might
+        not exist in a different dialect.
+
+        :param str query:
+        '''
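+        # Usage sketch (hypothetical table name); several statements can be
+        # passed in one string, they are split with sqlparse and executed
+        # one by one:
+        #   sql_handler.execute("CREATE TABLE t (id INT); INSERT INTO t VALUES (1);")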
+        connection = self._engine.connect()
+        transaction = connection.begin()
+
+        errors = []
+
+        # in the case of a multi-statement query, execute each statement separately
+        for sub_query in sqlparse.split(query):
+            if len(sub_query) > 0:
+                try:
+                    connection.execute(sub_query)
+
+                except Exception as e:
+                    errors.append(str(e))
+
+        if len(errors) > 0:
+            err = ('Could not execute some of the queries. '
+                   'Obtained exceptions: {}'
+                   .format('\n'.join(errors)))
+
+            self._log.error(err)
+            raise Exception(err)
+
+        transaction.commit()
+        connection.close()
+
+    def execute_query_from_file(self, filename: str):
+        '''
+        Reads an sql query from a file and executes it.
+
+        :param str filename: path to the file containing the query
+        '''
+        with open(filename, 'r') as f:
+            query = f.read()
+
+        self.execute(query)
+
+    def get_tablenames(self, schema: str = None, query: str = None):
+        '''
+        Returns the list of table names in the given schema.
+
+        :param str schema: if None, the default schema from _db_metadata is used
+        :param str query: if not specified, the table names are selected
+            from the information_schema specified in _db_metadata
+        '''
+        if (self._db_metadata is None) and (query is None):
+            raise Exception('Please specify the query')
+
+        else:
+            try:
+                if query is None:
+                    schema_or_default_schema =\
+                        self._db_metadata['default_schema']\
+                        if schema is None else schema
+
+                    query = """SELECT DISTINCT {0}
+                               FROM {1}.tables
+                               WHERE {2} = '{3}'
+                            """.format(
+                            self._db_metadata['table_col'],
+                            self._db_metadata['information_schema'],
+                            self._db_metadata['schema_col'],
+                            schema_or_default_schema)
+
+                tables = self.read_sql_to_dataframe(query).iloc[:, 0].tolist()
+                return tables
+
+            except Exception as e:
+                err = ("Could not get tablenames"
+                       "Finished with error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+    def check_if_table_exists(self, tablename: str,
+                              schema: str = None,
+                              query: str = None):
+        '''
+        Tries to retrieve table information from database with given query.
+        If this does not work, tries to select one row from the given table,
+        if this fails, assumes that the table does not exist.
+
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, tries to find
+            tablename in information_schema specified in _db_metadata.
+        :return: if the table exists or not
+        :rtype: bool
+        '''
+        if self._is_case_insensitive:
+            tablename = tablename.upper()
+
+        try:
+            tablenames = self.get_tablenames(schema=schema, query=query)
+
+            table_exists = (tablename in tablenames)
+
+        except Exception as e:
+            self._log.warning(('Could not execute query to retrieve table '
+                               'information. Trying to execute a '
+                               'select statement. '
+                               'Got exception {}').format(e))
+            try:
+                query = """SELECT *
+                           FROM {0}{1}
+                           LIMIT 1
+                        """.format('' if schema is None else schema + '.',
+                                   tablename)
+
+                self.execute(query)
+
+                table_exists = True
+
+            except Exception as e:
+                self._log.warning(('Failed to select from {0}. '
+                                   'Finished with error {1}. '
+                                   'Conclusion: table does not exist')
+                                  .format(tablename, e))
+
+                table_exists = False
+
+        return table_exists
+
+    def create_schema(self, schema: str, query: str = None):
+        '''
+        Creates a schema if it does not exist, else does nothing
+
+        :param str schema:
+        :param str query: if None trying to read schemas from
+            information_schema specified in db_metadata
+        '''
+        if (query is None):
+
+            if self._db_metadata is None:
+                raise Exception('Please specify query')
+            else:
+                query = """SELECT DISTINCT {0}
+                           FROM {1}.tables""".format(
+                              self._db_metadata['schema_col'],
+                              self._db_metadata['information_schema'])
+
+        try:
+            schemas = self.read_sql_to_dataframe(query).iloc[:, 0].tolist()
+        except Exception as e:
+            err = ("Could not retrieve the list of schemas"
+                   "from the database. Finished with error {}"
+                   .format(e))
+
+            self._log.error(err)
+            raise Exception(err)
+
+        if schema not in schemas:
+            self.execute("CREATE SCHEMA {}".format(schema))
+
+    def drop_table_if_exists(self, tablename: str,
+                             schema: str = None,
+                             query: str = None):
+        '''
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, default value is "DROP TABLE"
+        '''
+        if self._is_case_insensitive:
+            tablename = tablename.upper()
+
+        schema_prefix = '' if schema is None else schema + '.'
+
+        if query is None:
+            query = "DROP TABLE {0}{1};".format(schema_prefix, tablename)
+
+        try:
+            if self.check_if_table_exists(tablename, schema=schema):
+                self.execute(query)
+
+        except Exception as e:
+            err = ("Could not drop the table {0} ."
+                   "Finished with error {1}"
+                   .format(tablename, e))
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def get_column_names(self, tablename: str,
+                         schema: str = None,
+                         query: str = None):
+        '''
+        Tries to retrieve column information from database with given query.
+        If this does not work, tries to select one row from the given table.
+
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, tries to select column
+            names in the information_schema specified in db_metadata
+        '''
+        if self._is_case_insensitive:
+            tablename = tablename.upper()
+
+        if not self.check_if_table_exists(tablename=tablename,
+                                          schema=schema):
+
+            err = "Table {} does not exist".format(tablename)
+            self._log.error(err)
+            raise Exception(err)
+
+        try:
+            if query is None:
+                if self._db_metadata is None:
+                    raise Exception('Please specify the query')
+
+                else:
+                    schema_or_default_schema =\
+                        self._db_metadata['default_schema']\
+                        if schema is None else schema
+
+                    query = """SELECT DISTINCT {0}
+                               FROM {1}.columns
+                               WHERE {2} = '{3}'
+                               AND {4} = '{5}'
+                            """.format(
+                            self._db_metadata['column_col'],
+                            self._db_metadata['information_schema'],
+                            self._db_metadata['schema_col'],
+                            schema_or_default_schema,
+                            self._db_metadata['table_col'],
+                            tablename)
+
+            colnames = [c.lower() for c in
+                        self.read_sql_to_dataframe(query).iloc[:, 0].tolist()]
+
+        except Exception as e:
+            self._log.warning((
+                'Could not select columns from the '
+                'information schema. Trying to '
+                'load the table into a dataframe and select the column names. '
+                'Obtained exception {}').format(e))
+
+            query = """SELECT *
+                       FROM {0}{1}
+                       LIMIT 1
+                    """.format('' if schema is None else schema + '.',
+                               tablename)
+
+            data = self.read_sql_to_dataframe(query)
+            colnames = data.columns.tolist()
+
+        return colnames
+
+    def load_csv_to_db(self, filename: str,
+                       tablename: str,
+                       schema: str = None,
+                       query: str = None,
+                       **kwargs):
+        '''
+        Tries to load data from csv file to database with a given query.
+        If this does not work, tries to load data from csv to a
+        pandas dataframe first, and then write it to the database.
+
+        :param str filename:
+        :param str tablename:
+        :param str schema:
+        :param str query: if not specified, tries to use
+        LOAD DATA LOCAL INFILE query
+        '''
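+        # Usage sketch (hypothetical file path; 'rs1' is the table read in the
+        # test block of MigrationCleaning.py). Requires an existing table and a
+        # MySQL connection that allows LOCAL INFILE:
+        #   sql_handler.load_csv_to_db(filename="./data/rs1.csv", tablename="rs1")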
+
+        if not self.check_if_table_exists(tablename=tablename,
+                                          schema=schema):
+
+            err = ('Table {} does not exist. '
+                   'Please create it first').format(tablename)
+            self._log.error(err)
+            raise Exception(err)
+
+        else:
+            try:
+                if query is None:
+                    query = """LOAD DATA LOCAL INFILE '{0}'
+                               INTO TABLE {1}{2}
+                               COLUMNS TERMINATED BY ','
+                               OPTIONALLY ENCLOSED BY '"'
+                               LINES TERMINATED BY '\r\n'
+                               IGNORE 1 LINES
+                               ({3})
+                               ;""".format(
+                                   filename,
+                                   '' if schema is None else schema + '.',
+                                   tablename,
+                                   ','.join(self.get_column_names(tablename)))
+
+                self.execute(query)
+
+            except Exception as e:
+                err = ("Could not load the file {0} "
+                       "to the table {1} ."
+                       "Finished with error {2}")\
+                       .format(filename, tablename, e)
+
+                self._log.error(err)
+                raise Exception(err)
+
+    def read_sql_to_dataframe(self, query: str, **read_sql_kwargs):
+        '''
+        :param str query: normally a SELECT sql statement
+        :param read_sql_kwargs: additional arguments to pandas read_sql method
+        :return: selected data
+        :rtype: DataFrame
+        '''
+        try:
+            connection = self._engine.connect()
+
+            data = pd.read_sql(sql=query,
+                               con=connection,
+                               **read_sql_kwargs)
+
+            connection.close()
+            return data
+
+        except Exception as e:
+            err = ("Could not read the query to a dataframe. "
+                   "Finished with error {}").format(e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def read_table(self, tablename: str,
+                   schema: str = None,
+                   **read_sql_kwargs):
+        '''
+        :param str tablename:
+        :param str schema:
+        :param read_sql_kwargs: additional arguments to pandas read_sql method
+        :return: selected table
+        :rtype: DataFrame
+        '''
+        schema = '' if schema is None else schema + '.'
+
+        try:
+            return self.read_sql_to_dataframe(
+                    query="SELECT * FROM {0}{1};".format(schema, tablename),
+                    **read_sql_kwargs)
+        except Exception as e:
+            err = ("Could not read the table {0} to a dataframe. "
+                   "Finished with error {1}").format(tablename, e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def append_to_table(self, data: pd.DataFrame,
+                        tablename: str,
+                        schema: str = None,
+                        to_sql_kwargs={'index': False}):
+        '''
+        :param DataFrame data: data to append
+        :param str tablename: table where data is appended
+        :param str schema:
+        :param dict to_sql_kwargs: additional arguments to pandas to_sql method
+        '''
+        if schema is not None:
+            self.create_schema(schema)
+
+        try:
+            connection = self._engine.connect()
+
+            data.to_sql(name=tablename,
+                        schema=schema,
+                        con=connection,
+                        if_exists='append',
+                        **to_sql_kwargs)
+
+            connection.close()
+
+        except Exception as e:
+            err = ("Could append data to the table {0}. "
+                   "Finished with error {1}").format(tablename, e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def overwrite_table(self, data: pd.DataFrame,
+                        tablename: str,
+                        schema: str = None,
+                        to_sql_kwargs={'index': False}):
+        '''
+        :param DataFrame data: data to write to the database
+        :param str tablename: table where data is written
+        :param str schema:
+        :param to_sql_kwargs: additional arguments to pandas to_sql method
+        '''
+
+        if schema is not None:
+            self.create_schema(schema)
+
+        try:
+
+            connection = self._engine.connect()
+
+            data.to_sql(name=tablename,
+                        schema=schema,
+                        con=connection,
+                        if_exists='replace',
+                        **to_sql_kwargs)
+
+            connection.close()
+
+        except Exception as e:
+            err = ("Could overwrite the table {0}. "
+                   "Finished with error {1}").format(tablename, e)
+
+            self._log.error(err)
+            raise Exception(err)
+
+    def draw_er_diagram_from_db(self, diagram_path: str = None,
+                                schema: str = None,
+                                include_tables: list = None):
+        '''
+        Renders an entity-relationship diagram of the database to an image
+        file using eralchemy.
+
+        :param str diagram_path: output path, defaults to "erd.png"
+        :param str schema: restrict the diagram to this schema
+        :param list include_tables: restrict the diagram to these tables
+        '''
+        if diagram_path is None:
+            diagram_path = "erd.png"
+        else:
+            diagram_dir = os.path.dirname(diagram_path)
+            if diagram_dir != "":
+                os.makedirs(diagram_dir, exist_ok=True)
+
+        import eralchemy
+        eralchemy.render_er(self._db_uri,
+                            diagram_path,
+                            schema=schema,
+                            include_tables=include_tables)

BIN
db_handlers/__pycache__/MongodbHandler.cpython-37.pyc


BIN
db_handlers/__pycache__/SQLHandler.cpython-37.pyc


BIN
db_handlers/__pycache__/SQLOperations.cpython-37.pyc


+ 352 - 0
db_migration/DataFrameToCollection.py

@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul 22 11:05:47 2019
+
+@author: tanya
+
+@description: a function to reshape a pandas dataframe to a list of
+(possibly nested) documents with respect to a (json) mongodb schema
+"""
+
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+
+class DataFrameToCollection:
+    '''
+    Reshapes a pandas dataframe into a list of (possibly nested) documents
+    with respect to a (json) mongodb schema.
+    '''
+    def __init__(self, schema_path: str = None, log_path: str = None):
+        '''
+        '''
+        from libraries.log import Log
+        import json
+
+        self._log = Log("ParseJsonSchema")
+
+        if schema_path is not None:
+
+            if not os.path.isfile(schema_path):
+                err = "JsonSchema not found"
+                self._log.error(err)
+                raise FileNotFoundError(err)
+
+            # load schema to dictionary if it is a valid json file
+            try:
+                with open(schema_path, "r") as f:
+                    self.schema = json.load(f)
+
+            except Exception as e:
+                err = ("Could not load json schema, "
+                       "Obtained error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+        else:
+            self.schema = None
+
+    def to_list_of_documents(self, data: pd.DataFrame,
+                             grp_fields: list,
+                             schema: dict = None,
+                             _return_data: bool = False) -> list:
+        '''
+        Reshapes a pandas dataframe to a list of documents according
+         to a complex (json) mongodb schema
+
+         Remark1: column names of data need to reflect the "nestedness"
+         of the field in the mongodb schema with the help of a "." separator
+         Example: field.sub_field_1, field.sub_field_2
+
+         Remark2: if the schema is stored as a json file, first load it
+         to a dictionary with the help of the python json module
+        '''
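+        # For example (columns taken from the test block at the bottom of this
+        # file), the columns ["a", "d.da", "d.db"] grouped by ["a"] produce
+        # documents of the form {"a": 1, "d": {"da": 11, "db": 33}}.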
+        from copy import deepcopy
+        from libraries.log import Log
+
+        log = Log("reshape_dataframe_to_list_of_documents:")
+
+        data = self._melt_duplicated_columns(data)
+
+        reshaped_fields = []
+
+        if schema is None:
+            schema = self.schema
+
+        for field in schema["properties"]:
+
+            if field not in self._unroll_nested_names(data.columns):
+                continue
+
+            field_type = schema["properties"][field]["bsonType"]
+
+            # if field has a simple type
+            if field_type not in ["array", "object"]:
+
+                grp_fields = [c for c in grp_fields if c in data.columns]
+
+                n_distinct_values = data.groupby(grp_fields)[field].nunique()\
+                                        .max()
+
+                if n_distinct_values != 1:
+                    err = "Field {0} is not unique with respect to {1}"\
+                          .format(field, grp_fields)
+
+                    log.error(err)
+                    raise Exception(err)
+
+                if field not in grp_fields:
+                    reshaped_field = data.groupby(grp_fields)[field].first()
+                else:
+                    reshaped_field =\
+                        data[grp_fields].drop_duplicates()\
+                        .set_index(grp_fields, drop=False)[field]
+
+                reshaped_fields.append(reshaped_field)
+
+            # if field is sub-document (dictionary)
+            elif field_type == "object":
+
+                sub_schema = deepcopy(schema["properties"][field])
+
+                # rename sub-schema properties to match with data column names
+                sub_schema["properties"] =\
+                    {".".join([field, k]): v for k, v
+                     in sub_schema["properties"].items()}
+
+                sub_data = self.to_list_of_documents(
+                            data=data,
+                            schema=sub_schema,
+                            grp_fields=grp_fields,
+                            _return_data=True)
+
+                reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                reshaped_field.name = field
+
+                reshaped_fields.append(reshaped_field)
+
+            # if field is a list of dictionaries
+            elif field_type == "array":
+
+                items_type = schema["properties"][field]["items"]["bsonType"]
+
+                if items_type == "object":
+
+                    sub_schema = deepcopy(schema["properties"][field]["items"])
+
+                    # rename sub-schema properties to match data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v in
+                         sub_schema["properties"].items()}
+
+                    # extend grp fields by sub-fields of field simple types
+                    sub_grp_fields =\
+                        [f for f in sub_schema["properties"]
+                         if sub_schema["properties"][f]["bsonType"]
+                         not in ["array", "object"]]
+
+                    if len(sub_grp_fields) == 0:
+                        err = ("One of the sub-keys in a list of documents"
+                               " must be of simple type for the field {}"
+                               .format(field))
+
+                        log.error(err)
+                        raise Exception(err)
+
+                    # group and reshape sub-fields with complex types
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields + sub_grp_fields,
+                                _return_data=True)
+
+                    if sub_data is not None:
+
+                        # gather the results into a list of dictionaries
+                        sub_data = sub_data.apply(self._make_dict, axis=1)
+
+                        sub_data.name = field
+                        sub_data = sub_data.reset_index(grp_fields)
+
+                        reshaped_field =\
+                            sub_data.groupby(grp_fields)[field]\
+                                    .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+                # if field is a list of values with simple type
+                else:
+
+                    grp_fields = [c for c in grp_fields if c in data.columns]
+
+                    if field in data.columns:
+
+                        reshaped_field = data.groupby(grp_fields)[field]\
+                                           .apply(self._make_list_of_distinct)
+
+                        reshaped_fields.append(reshaped_field)
+
+        if len(reshaped_fields) > 0:
+            reshaped_data = pd.concat(reshaped_fields, axis=1)
+
+            if not _return_data:
+
+                list_of_documents =\
+                    reshaped_data.drop(list(reshaped_data.index.names),
+                                       axis=1, errors="ignore")\
+                                 .reset_index(drop=False)
+
+                log.info("Done reshaping the dataframe to a list of documents")
+
+                return list_of_documents
+
+            else:
+
+                return reshaped_data
+
+    def _melt_duplicated_columns(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Melts columns that appear more than once in the dataframe
+        into a single column.
+        '''
+        for c in set(data.columns):
+            if isinstance(data[c], pd.DataFrame):
+                data = pd.melt(data, id_vars=[cc for cc in data.columns
+                                              if cc != c], value_vars=c)\
+                         .drop("variable", axis=1)\
+                         .rename(columns={"value": c})
+
+        return data
+
+    def _make_dict(self, x: pd.Series) -> dict:
+        '''
+        :return: the input pandas series transformed into a dictionary;
+         meant to be applied to a dataframe with axis = 1,
+         so that the index of the input series holds the column names
+         of the dataframe
+        '''
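+        # e.g. a series with index ["d.da", "d.db"] and values [11, 33]
+        # becomes {"da": 11, "db": 33}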
+        return {f.split(".")[-1]: x[f] for f in x.index}
+
+    def _make_list(self, x: pd.Series) -> list:
+        '''
+        return: list of values in a series
+        '''
+        return list(x)
+
+    def _make_list_of_distinct(self, x: pd.Series) -> list:
+        '''
+        return: list of unique values from a Series where
+         entries are arbitrary objects
+         (pandas unique() method does not work if entries are of complex types)
+        '''
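+        # e.g. pd.Series([{"a": 1}, {"a": 1}, {"b": 2}]) -> [{"a": 1}, {"b": 2}]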
+        distinct = []
+        for obj in x:
+            if obj not in distinct:
+                distinct.append(obj)
+        return distinct
+
+    def _unroll_nested_names(self, columns: list) -> list:
+        '''
+        Returns all nested prefixes of the given column names,
+        e.g. "a.b.c" yields "a", "a.b" and "a.b.c".
+        '''
+        unrolled = []
+
+        for c in columns:
+            splitted = c.split(".")
+            for i in range(len(splitted)):
+                unrolled.append(".".join(splitted[:i+1]))
+
+        return unrolled
+
+
+if __name__ == "__main__":
+
+    # Testing
+
+    df = pd.DataFrame({
+                       "a": [1]*8 + [2]*8,
+                       "b": [10]*8 + [20]*8,
+                       "c": [100, 200]*8,
+                       "d.da": [11]*8 + [22]*8,
+                       "d.db": [33]*8 + [34]*8,
+                       "e.ea.eaa": [5]*8 + [55]*8,
+                       "e.ea.eab": [6]*8 + [66]*8,
+                       "e.eb": [2, 2, 3, 3]*4,
+                       "e.ec.eca": [1, 2, 3, 4]*4,
+                       "e.ec.ecb": [5, 6, 7, 8]*4,
+                       "f.fa": [1]*4 + [3]*4 + [11]*4 + [33]*4,
+                       "f.fb": [2]*4 + [3]*2 + [4]*2 + [22]*4 + [44]*4})
+
+    duplicate = pd.DataFrame({"c": [300, 400]*8})
+
+    df = pd.concat([df, duplicate], axis=1)
+
+    schm = {
+              "bsonType": "object",
+              "required": ["a"],
+              "properties": {
+
+                  "a": {"bsonType": "integer"},
+
+                  "b": {"bsonType": "integer"},
+
+                  "c": {
+                      "bsonType": "array",
+                      "items": {"bsonType": "integer"}
+                  },
+                  "d": {
+                      "bsonType": "object",
+                      "properties": {
+                          "da": {"bsonType": "integer"},
+                          "db": {"bsonType": "integer"}
+                       }
+                  },
+                  "e": {
+                      "bsonType": "object",
+                      "properties": {
+                          "ea": {
+                              "bsonType": "object",
+                              "properties": {
+                                  "eaa": {"bsonType": "integer"},
+                                  "eab": {"bsonType": "integer"}
+                               }
+
+                          },
+
+                          "eb": {
+                              "bsonType": "array",
+                              "items": {"bsonType": "integer"}
+                          },
+
+                          "ec": {
+                                "bsonType": "array",
+                                "items": {
+                                  "bsonType": "object",
+                                  "properties": {
+                                      "eca": {"bsonType": "integer"},
+                                      "ecb": {"bsonType": "integer"}
+                                    }
+                                  }
+                          }
+                      }
+                  },
+                  "f": {
+                      "bsonType": "array",
+                      "items": {
+                          "bsonType": "object",
+                          "properties": {
+                              "fa": {"bsonType": "integer"},
+                              "fb": {
+                                  "bsonType": "array",
+                                  "items": {"bsonType": "integer"}
+                              }
+                          }
+                      }
+                  }
+              }
+              }
+
+    grp_fields = ["a"]
+
+    result = DataFrameToCollection().to_list_of_documents(
+                    data=df,
+                    schema=schm,
+                    grp_fields=grp_fields)

+ 520 - 0
db_migration/MigrationCleaning.py

@@ -0,0 +1,520 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 25 08:09:52 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import pandas as pd
+import numpy as np
+import gc
+
+sys.path.append(os.getcwd())
+
+from libraries.db_migration.ParseMapping import ParseMapping
+from libraries.db_migration.ParseJsonSchema import ParseJsonSchema
+from libraries.utils.ClassLogging import ClassLogging
+from libraries.utils.CleaningUtils import CleaningUtils
+
+
+class MigrationCleaning(ClassLogging):
+    '''
+    Class for correcting and filtering the incorrect data.
+    We keep the correcting and the filtering methods separated,
+    since there might be other custom steps in between.
+    '''
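+    # A typical call order (see the test block at the bottom of this file):
+    # replace_default_values -> map_equal_values -> convert_types ->
+    # filter_invalid_types -> filter_invalid_null_values ->
+    # filter_invalid_patterns -> filter_notallowed_values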
+    def __init__(self, mapping_path: str,
+                 schema_paths: (str, list),
+                 inconsist_report_table: str = None,
+                 filter_index_columns: (str, list) = None,
+                 mapping_source: str = "internal_name",
+                 mapping_target: str = "mongo_name",
+                 mapping_parser: type = ParseMapping,
+                 schema_parser: type = ParseJsonSchema,
+                 log_name: str = "MigrationCleaning"):
+        '''
+        '''
+        super().__init__(log_name=log_name)
+
+        assert (inconsist_report_table is None) or\
+            isinstance(inconsist_report_table, str),\
+            "Inconsistent report table should be a tablename string"
+
+        self._inconsist_report_table = inconsist_report_table
+
+        assert (filter_index_columns is None) or\
+            isinstance(filter_index_columns, (str, list)),\
+            "Filter index columns must be a str or a list"
+
+        if isinstance(filter_index_columns, str):
+            filter_index_columns = [filter_index_columns]
+
+        self._filter_index_columns = filter_index_columns
+
+        self._schema_parser = schema_parser(schema_paths)
+
+        self._mapping_parser = mapping_parser(mapping_path,
+                                              source=mapping_source,
+                                              target=mapping_target)
+
+        self._mapping_path = mapping_path
+        self._schema_paths = schema_paths
+
+    def _assert_dataframe_input(self, data: pd.DataFrame):
+        '''
+        '''
+        assert(isinstance(data, pd.DataFrame)),\
+            "Parameter 'data' must be a pandas dataframe"
+
+    @property
+    def _field_mapping(self):
+        '''
+        '''
+        return self._mapping_parser.get_field_mapping()
+
+    @property
+    def _required_fields(self):
+        '''
+        '''
+        source_required_fields = self._mapping_parser.get_required_fields()
+        target_required_fields = self._schema_parser.get_required_fields()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if (target_field in target_required_fields) and\
+                    (source_field not in source_required_fields):
+
+                source_required_fields.append(source_field)
+
+        return source_required_fields
+
+    @property
+    def _default_values(self):
+        '''
+        '''
+        default_values = {}
+
+        target_default_values = self._schema_parser.get_default_values()
+        source_default_values = self._mapping_parser.get_default_values()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if source_field not in source_default_values:
+                continue
+
+            elif target_field not in target_default_values:
+
+                target_default_values[target_field] = np.nan
+
+            default_values[source_field] = {
+                    target_default_values[target_field]:
+                    source_default_values[source_field]
+                    }
+
+        return default_values
+
+    @property
+    def _python_types(self):
+        '''
+        '''
+        target_types = self._schema_parser.get_python_types()
+
+        result = {}
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if target_field in target_types:
+                result[source_field] = target_types[target_field]
+
+            """
+            date_type_mismatch =\
+                    (target_field in target_types) and\
+                    (source_field in source_types) and\
+                    (target_types[target_field] == str) and\
+                    (source_types[source_field] == np.dtype('<M8[ns]'))
+
+            if date_type_mismatch:
+                target_types[target_field] = np.dtype('<M8[ns]')
+
+            if (source_field in source_types) and\
+                    (target_field in target_types) and\
+                    (target_types[target_field] != source_types[source_field]):
+
+                self.log_and_raise(("Type {0} of field {1} "
+                                    "in schema does not match "
+                                    "type {2} of field {3} in "
+                                    "migration mapping")
+                                   .format(target_types[target_field],
+                                           target_field,
+                                           source_types[source_field],
+                                           source_field))
+
+            if target_field in target_types:
+                source_types[source_field] = target_types[target_field]
+
+            """
+
+        return result
+
+    @property
+    def _value_mappings(self):
+        '''
+        '''
+        return self._mapping_parser.get_value_mappings()
+
+    @property
+    def _date_formats(self):
+        '''
+        '''
+        return self._mapping_parser.get_date_formats()
+
+    def _get_mongo_schema_info(self, method_name: str):
+        '''
+        '''
+        result = {}
+
+        target_dict = getattr(self._schema_parser, method_name)()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if target_field in target_dict:
+
+                result[source_field] = target_dict[target_field]
+
+        return result
+
+    @property
+    def _allowed_values(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_allowed_values")
+
+    @property
+    def _minimum_values(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_minimum_value")
+
+    @property
+    def _maximum_values(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_maximum_value")
+
+    @property
+    def _patterns(self):
+        '''
+        '''
+        return self._get_mongo_schema_info("get_patterns")
+
+    def _filter_invalid_data(self, data: pd.DataFrame,
+                             invalid_mask: pd.Series,
+                             reason: (str, pd.Series)) -> pd.DataFrame:
+        '''
+        Appends the invalid rows, together with the filtering reason, to the
+        inconsistency report table and drops all rows belonging to the
+        affected instances (identified by the filter index columns).
+        '''
+        from libraries.db_handlers.SQLHandler import SQLHandler
+
+        assert((self._inconsist_report_table is not None) and
+               (self._filter_index_columns is not None)),\
+            "Inconsistent report table or filter index is not provided"
+
+        self._assert_dataframe_input(data)
+
+        data = data.copy(deep=True)
+
+        db = SQLHandler()
+
+        if invalid_mask.sum() == 0:
+
+            return data
+
+        data_inconsist = data.assign(reason=reason)\
+                             .loc[invalid_mask]\
+                             .reset_index(drop=True)
+
+        db.append_to_table(data=data_inconsist,
+                           tablename=self._inconsist_report_table)
+
+        n_rows_filtered = len(data_inconsist)
+        n_instances_filtered = len(data_inconsist[self._filter_index_columns].drop_duplicates())
+
+        del data_inconsist
+        gc.collect()
+
+        self._log.warning(("Filtering: {0} ."
+                           "Filtered {1} rows "
+                           "and {2} instances"
+                           .format(reason, n_rows_filtered, n_instances_filtered)))
+
+        nok_index_data = data.loc[invalid_mask, self._filter_index_columns]\
+                             .drop_duplicates().reset_index(drop=True)
+
+        nok_index = pd.MultiIndex.from_arrays([nok_index_data[c] for c in
+                                               self._filter_index_columns])
+
+        all_index = pd.MultiIndex.from_arrays([data[c] for c in
+                                               self._filter_index_columns])
+
+        data = data.loc[~all_index.isin(nok_index)].reset_index(drop=True)
+
+        return data
+
+    def _replace_values(self, data: pd.DataFrame,
+                        default: bool) -> pd.DataFrame:
+        '''
+        '''
+        if default:
+            default_str = "default"
+        else:
+            default_str = "equal"
+
+        self._assert_dataframe_input(data)
+
+        data = data.copy(deep=True)
+
+        if default:
+            mapping = self._default_values
+        else:
+            mapping = self._value_mappings
+
+        for column, d in mapping.items():
+
+            try:
+
+                if column not in data.columns:
+                    continue
+
+                dtype = data[column].dtype
+
+                for key, values in d.items():
+
+                    if not default:
+
+                        mask = (data[column].astype(str).isin(values))
+
+                    else:
+                        mask = (data[column].isin(values))
+
+                    if default:
+
+                        mask = mask | (data[column].isnull())
+
+                    data.loc[mask, column] = key
+
+                data[column] = data[column].astype(dtype)
+
+            except Exception as e:
+
+                self.log_and_raise(("Failed to replace {0} values "
+                                    "in {1}. Exit with error {2}"
+                                    .format(default_str, column, e)))
+
+        self._log.info("Replaced {} values".format(default_str))
+
+        return data
+
+    def replace_default_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        return self._replace_values(data=data, default=True)
+
+    def map_equal_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        return self._replace_values(data=data, default=False)
+
+    def convert_types(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column, python_type in self._python_types.items():
+
+            try:
+                if column not in data.columns:
+                    continue
+
+                elif column in self._date_formats:
+
+                    data[column] = CleaningUtils.convert_dates(
+                            series=data[column],
+                            formats=self._date_formats[column])
+
+                elif (python_type == int) and data[column].isnull().any():
+
+                    self.log_and_raise(("Column {} contains missing values "
+                                        "and cannot be of integer type"
+                                        .format(column)))
+
+                elif python_type == str:
+
+                    python_type = object
+
+                else:
+
+                    data[column] = data[column].astype(python_type)
+
+                if data[column].dtype != python_type:
+
+                    self._log.warning(("After conversion type in {0} "
+                                       "should be {1} "
+                                       "but is still {2}"
+                                       .format(column,
+                                               python_type,
+                                               data[column].dtype)))
+
+            except Exception as e:
+
+                self.log_and_raise(("Failed to convert types in {0}. "
+                                    "Exit with error {1}"
+                                    .format(column, e)))
+
+        self._log.info("Converted dtypes")
+
+        return data
+
+    def filter_invalid_null_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column in data.columns:
+
+            if (column in self._required_fields) and\
+                    (data[column].isnull().any()):
+
+                invalid_mask = data[column].isnull()
+
+                reason = "Null value in the required field {}"\
+                         .format(column)
+
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
+
+        return data
+
+    def filter_invalid_types(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column, python_type in self._python_types.items():
+
+            if column not in data.columns:
+                continue
+
+            if data[column].dtype != python_type:
+
+                def mismatch_type(x):
+                    return type(x) != python_type
+
+                invalid_mask = data[column].apply(mismatch_type)
+
+                reason = "Type mismatch if field {}".format(column)
+
+                data = self._filter_invalid_data(data=data,
+                                                 invalid_mask=invalid_mask,
+                                                 reason=reason)
+
+        return data
+
+    def filter_invalid_patterns(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self._assert_dataframe_input(data)
+
+        for column, pattern in self._patterns.items():
+
+            if column not in data.columns:
+                continue
+
+            invalid_mask = (~data[column].astype(str).str.match(pattern))
+
+            reason = "Pattern mismatch in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        return data
+
+    def filter_notallowed_values(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        for column, value in self._minimum_values.items():
+
+            invalid_mask = data[column] < value
+
+            reason = "Too small values in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        for column, value in self._maximum_values.items():
+
+            invalid_mask = data[column] > value
+
+            reason = "Too large values in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        for column, allowed_values in self._allowed_values.items():
+
+            invalid_mask = (~data[column].isin(allowed_values))
+
+            reason = "Too small values in field {}".format(column)
+
+            data = self._filter_invalid_data(data=data,
+                                             invalid_mask=invalid_mask,
+                                             reason=reason)
+
+        return data
+
+
+if __name__ == "__main__":
+
+    # testing
+
+    from libraries.db_handlers.SQLHandler import SQLHandler
+
+    mapping_path = os.path.join(".", "migration_mappings", "rs1_mapping.json")
+
+    schema_paths = [
+            os.path.join(".", "mongo_schema", "schema_wheelsets.json"),
+            os.path.join(".", "mongo_schema", "schema_process_instances.json")]
+
+    inconsist_report_table = "test_inconsist_report_rs1"
+
+    if all([os.path.isfile(p) for p in schema_paths + [mapping_path]]):
+
+        print("Found schemas!")
+
+        cleaner = MigrationCleaning(
+                mapping_path=mapping_path,
+                schema_paths=schema_paths,
+                mapping_source="internal_name",
+                mapping_target="mongo_name",
+                filter_index_columns=["radsatznummer"],
+                inconsist_report_table=inconsist_report_table)
+
+        db = SQLHandler()
+
+        data = db.read_sql_to_dataframe("select * from rs1 limit 100")
+
+        data = cleaner.replace_default_values(data)
+
+        data = cleaner.map_equal_values(data)
+
+        data = cleaner.convert_types(data)
+
+        non_filtered_len = len(data)
+
+        data = cleaner.filter_invalid_types(data)
+
+        if len(data) < non_filtered_len:
+
+            data = cleaner.convert_types(data)
+
+        data = cleaner.filter_invalid_null_values(data)
+
+        data = cleaner.filter_invalid_patterns(data)
+
+        data = cleaner.filter_notallowed_values(data)
+
+    print("Done!")

+ 62 - 0
db_migration/ParseDbSchema.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 25 08:22:20 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import abc
+sys.path.append(os.getcwd())
+
+
+class ParseDbSchema(metaclass=abc.ABCMeta):
+    '''
+    '''
+    def __init__(self, schema_paths: [list, str], log_file: str = None):
+        '''
+        '''
+        from libraries.log import Log
+
+        self._log = Log(name="ParseDbSchema:", log_file=log_file)
+
+        if isinstance(schema_paths, str):
+            schema_paths = [schema_paths]
+
+        for schema_path in schema_paths:
+            if not os.path.isfile(schema_path):
+                err = "Schema not found"
+                self._log.error(err)
+                raise FileNotFoundError(err)
+
+    @abc.abstractmethod
+    def get_fields(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_datetime_fields(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_python_types(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_default_values(self) -> list:
+        '''
+        '''
+        return
+
+    @abc.abstractmethod
+    def get_allowed_values(self) -> list:
+        '''
+        '''
+        return

+ 332 - 0
db_migration/ParseJsonSchema.py

@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jan 31 11:41:48 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+from copy import deepcopy
+import numpy as np
+
+sys.path.append(os.getcwd())
+
+from libraries.db_migration.ParseDbSchema import ParseDbSchema
+
+
+class ParseJsonSchema(ParseDbSchema):
+    '''
+    Class for retrieving column properties from mongodb jsonSchema
+    '''
+
+    def __init__(self, schema_paths: [list, str], log_file: str = None):
+        '''
+        '''
+        import json
+        from libraries.log import Log
+
+        super().__init__(schema_paths=schema_paths, log_file=log_file)
+
+        self._log = Log(name="ParseJsonSchema", log_file=log_file)
+
+        # load schemas to dictionaries if they are valid json files
+
+        assert(isinstance(schema_paths, (list, str))),\
+            "Schema paths must be either str or lists"
+
+        if isinstance(schema_paths, str):
+            schema_paths = [schema_paths]
+
+        self.schemas = []
+
+        for schema_path in schema_paths:
+            try:
+                with open(schema_path, "r") as f:
+                    self.schemas.append(json.load(f))
+
+            except Exception as e:
+                err = ("Could not load json schema, "
+                       "Obtained error {}".format(e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+    def get_fields(self) -> list:
+        '''
+        '''
+        return self._parse()
+
+    def get_required_fields(self) -> list:
+        '''
+        '''
+        return self._parse(required_only=True)
+
+    def get_mongo_types(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="bsonType")
+
+    def get_datetime_fields(self):
+        '''
+        '''
+        mongo_types = self.get_mongo_types()
+
+        return [k for k, v in mongo_types.items()
+                if v in ["date", "timestamp", "Date", "Timestamp"]]
+
+    def get_python_types(self) -> dict:
+        '''
+        '''
+        mongo_types = self.get_mongo_types()
+        python_types = {}
+
+        bson_to_python_types_except_dates = {"double": float,
+                                             "decimal": float,
+                                             "string": str,
+                                             "object": object,
+                                             "array": list,
+                                             "bool": bool,
+                                             "int": int,
+                                             "long": int,
+                                             "date": np.dtype('<M8[ns]'),
+                                             "timestamp": np.dtype('<M8[ns]')
+                                             }
+
+        for k, v in mongo_types.items():
+
+            if isinstance(v, list):
+                if ("date" in v) or ("timestamp" in v):
+                    v = "date"
+                elif "string" in v:
+                    v = "string"
+                elif ("double" in v) or ("decimal" in v):
+                    v = "double"
+                elif ("null" in v) and (len(v) == 2) and ("int" not in v):
+                    v = [t for t in v if type != "null"][0]
+                else:
+                    err = "Type {0}: {1} not convertibale".format(k, v)
+                    self._log.error(err)
+                    raise Exception(err)
+
+            if v in bson_to_python_types_except_dates:
+                python_types[k] = bson_to_python_types_except_dates[v]
+
+        return python_types
+
+    def get_patterns(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="pattern")
+
+    def get_default_values(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="default")
+
+    def get_allowed_values(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="enum")
+
+    def get_maximum_value(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="maximum")
+
+    def get_minimum_value(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="minimum")
+
+    def get_max_items(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="maxItems")
+
+    def get_min_items(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="minItems")
+
+    def get_field_descriptions(self) -> dict:
+        '''
+        '''
+        return self._parse(field_info="description")
+
+    def _parse(self,
+               field_info: str = None,
+               required_only: bool = False):
+        '''
+        '''
+        result = self._parse_one(schema=self.schemas[0],
+                                 field_info=field_info,
+                                 required_only=required_only)
+
+        for schema in self.schemas[1:]:
+
+            next_result = self._parse_one(schema=schema,
+                                          field_info=field_info,
+                                          required_only=required_only)
+
+            if isinstance(result, list):
+                result.extend(next_result)
+            else:
+                result.update(next_result)
+
+        return result
+
+    def _parse_one(self,
+                   schema: dict,
+                   field_info: str = None,
+                   required_only: bool = False,
+                   super_field_name: str = None,
+                   already_parsed: (list, dict) = None) -> (list, dict):
+        '''
+        Recursive function that returns a list of (nested) field names or
+        a dictionary of (nested) field names with field characteristics.
+
+        :param schema: if None => entire self.schema, or a sub-schema
+            of self.schema
+
+        :param field_info: optional, if provided a dictionary of field
+            names with field characteristics is returned (for example, the
+            bsonType of each field), else a list of fields is returned
+
+        :param required_only: when True, only returns fields marked as
+            required in the mongo schema
+
+        :param super_field_name: needed for recursion
+            Example: the field 'article' has
+            subfields 'id' and 'supplier'.
+            If we parse the sub-document corresponding to article, then
+            super_field_name is 'article' and we might get an output like
+            {'article.id': string, 'article.supplier': string}
+
+        :param already_parsed: needed for recursion
+
+        '''
+        schema = deepcopy(schema)
+
+        assert(isinstance(schema, dict)),\
+            "Parameter 'schema' must be a dict"
+
+        if field_info is None:
+            # parse a list of fields
+            if already_parsed is None:
+                already_parsed = []
+            else:
+                assert(isinstance(already_parsed, list)),\
+                    "Parameter 'already_parsed' must be of type list"
+        else:
+            # parse a dictionary of field names with field characteristics
+            if already_parsed is None:
+                already_parsed = {}
+            else:
+                assert(isinstance(already_parsed, dict)),\
+                    "Parameter 'already_parsed' must be of type dict"
+
+        # If schema is nested, then
+        # either it is of bsonType object
+        # and the field information is stored under the key 'properties'
+        # or it is of bsonType array
+        # and the field information is stored in sub-schemas
+        # under the key 'items'
+
+        # if schema is of bsonType object
+        if "properties" in schema.keys():
+            if "required" in schema.keys():
+                required_subfields = schema["required"]
+
+            for sub_field_name in schema["properties"].keys():
+
+                sub_schema = schema["properties"][sub_field_name]
+
+                # only process fields that are required
+                if required_only and\
+                        (sub_field_name not in required_subfields):
+                    pass
+                else:
+                    if super_field_name is not None:
+                        field_name = '.'.join([super_field_name,
+                                               sub_field_name])
+                    else:
+                        field_name = sub_field_name
+
+                    # if the given sub-field is nested, parse the
+                    # sub-schema corresponding to this sub-field
+                    self._parse_one(
+                            schema=sub_schema,
+                            super_field_name=field_name,
+                            field_info=field_info,
+                            already_parsed=already_parsed,
+                            required_only=required_only)
+
+        # if schema is of bsonType array
+        elif "items" in schema.keys():
+            # one schema for all items
+            if isinstance(schema["items"], dict):
+
+                sub_schema = schema["items"]
+
+                self._parse_one(schema=sub_schema,
+                                super_field_name=super_field_name,
+                                field_info=field_info,
+                                already_parsed=already_parsed,
+                                required_only=required_only)
+
+            # list of separate schemas for each item
+            elif isinstance(schema["items"], list):
+
+                for sub_schema in schema["items"]:
+                    self._parse_one(schema=sub_schema,
+                                    super_field_name=super_field_name,
+                                    field_info=field_info,
+                                    already_parsed=already_parsed,
+                                    required_only=required_only)
+            else:
+                raise Exception(('Schema is not composed correctly: '
+                                 'items must be a dictionary or a list'))
+        else:
+            # If neither properties nor items is in schema keys
+            # we reached the last level of nestedness,
+            # field information is stored in the schema keys.
+            field_name = super_field_name
+
+            if field_info is None:
+                already_parsed.append(field_name)
+            else:
+                if field_info in schema.keys():
+                    already_parsed[field_name] = schema[field_info]
+                else:
+                    pass
+
+        return already_parsed
+
+
+if __name__ == "__main__":
+
+    # Only for testing
+
+    schema_path = os.path.join(".", "mongo_schema", "schema_wheelsets.json")
+
+    if os.path.isfile(schema_path):
+
+        parse_obj = ParseJsonSchema(schema_paths=schema_path)
+
+        fields = parse_obj.get_fields()
+
+        required_fields = parse_obj.get_required_fields()
+
+        patterns = parse_obj.get_patterns()
+
+        mongo_types = parse_obj.get_mongo_types()
+
+        python_types_except_dates = parse_obj.get_python_types()
+
+        datetime_fields = parse_obj.get_datetime_fields()
+
+        allowed_values = parse_obj.get_allowed_values()
+
+        descriptions = parse_obj.get_field_descriptions()
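+
+        # Hedged illustration (not part of the original tests): _parse_one
+        # flattens nested schemas into dot-separated field names. For the
+        # minimal inline schema below the expected output is
+        # ['article.id', 'article.supplier'].
+        toy_schema = {
+            "bsonType": "object",
+            "properties": {
+                "article": {
+                    "bsonType": "object",
+                    "properties": {
+                        "id": {"bsonType": "string"},
+                        "supplier": {"bsonType": "string"}}}}}
+
+        print(parse_obj._parse_one(schema=toy_schema))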

+ 157 - 0
db_migration/ParseMapping.py

@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 20 15:33:17 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import numpy as np
+sys.path.append(os.getcwd())
+
+
+class ParseMapping:
+    '''
+    '''
+    def __init__(self, mapping_path: str, log_name: str = "ParseMapping",
+                 source: str = "original_name", target: str = "original_name"):
+        '''
+        '''
+        import json
+        from libraries.log import Log
+
+        self._log = Log(log_name)
+
+        if not os.path.isfile(mapping_path):
+            err = "Mapping not found"
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+        try:
+            with open(mapping_path, "r") as f:
+                self._mapping = json.load(f)
+
+        except Exception as e:
+            err = ("Could not load mapping. "
+                   "Exit with error {}".format(e))
+            self._log.error(err)
+            raise Exception(err)
+
+        self._source = source
+        self._target = target
+
+    def get_field_mapping(self) -> dict:
+        '''
+        '''
+        assert(all([set([self._source, self._target]) <= set(d)
+                    for d in self._mapping]))
+
+        return {d[self._source]: d[self._target] for d in self._mapping}
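+
+    # Hedged illustration of the assumed mapping layout (field names are
+    # made up; real mapping files may differ):
+    #   self._mapping = [{"original_name": "Radsatznummer",
+    #                     "internal_name": "radsatznummer",
+    #                     "mongo_name": "process.wheelset",
+    #                     "type": "Text", "required": 1}, ...]
+    # With source="internal_name" and target="mongo_name",
+    # get_field_mapping() would then return
+    #   {"radsatznummer": "process.wheelset"}.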
+
+    def _get_fields_satisfying_condition(self, key: str, value) -> list:
+        '''
+        '''
+        assert(all([self._source in d for d in self._mapping])),\
+            "Invalid from field"
+
+        return [d[self._source] for d in self._mapping
+                if (key in d) and (d[key] == value)]
+
+    def get_required_fields(self) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key="required",
+                                                        value=1)
+
+    def get_date_fields(self) -> list:
+        '''
+        '''
+        return self._get_fields_satistisfying_condition(key="type",
+                                                        value="Date")
+
+    def _get_info(self, key: str, value=None) -> dict:
+        '''
+        '''
+        assert(all([self._source in d for d in self._mapping])),\
+            "Invalid from field"
+
+        return {d[self._source]: d[key] for d in self._mapping
+                if (key in d) and ((value is None) or (d[key] == value))}
+
+    def get_default_values(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="default_values")
+
+    def get_date_formats(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="date_format")
+
+    def get_types(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="type")
+
+    def get_python_types(self) -> dict:
+        '''
+        '''
+        sql_to_python_dtypes = {
+                "Text": str,
+                "Date": np.dtype('<M8[ns]'),
+                "Double": float,
+                "Integer": int
+                }
+
+        sql_types = self.get_types()
+
+        return {k: sql_to_python_dtypes[v] for k, v in sql_types.items()}
+
+    def get_value_mappings(self) -> dict:
+        '''
+        '''
+        return self._get_info(key="value_mapping")
+
+    def get_column_numbers(self) -> list:
+        '''
+        '''
+        if all(["column_number" in d for d in self._mapping]):
+            column_numbers = [d["column_number"] for d in self._mapping]
+
+        elif all(["column_number" not in d for d in self._mapping]):
+            column_numbers = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Column numbers should ",
+                   "either in all or in neither of the fields")
+            self.log.err(err)
+            raise Exception(err)
+
+        return column_numbers
+
+
+if __name__ == "__main__":
+
+    mapping_path = os.path.join(".", "migration_mappings", "rs0_mapping.json")
+
+    if os.path.isfile(mapping_path):
+
+        print("found mapping path")
+
+        parser = ParseMapping(mapping_path, source="internal_name",
+                              target="mongo_name")
+
+        internal_to_mongo_mapping = parser.get_field_mapping()
+
+        original_to_internal_mapping = parser.get_field_mapping()
+
+        default_values = parser.get_default_values()
+
+        types = parser.get_types()
+
+        column_numbers = parser.get_column_numbers()
+
+        print("Done testing!")

BIN
db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc


BIN
db_migration/__pycache__/MigrationCleaning.cpython-37.pyc


BIN
db_migration/__pycache__/ParseDbSchema.cpython-37.pyc


BIN
db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc


BIN
db_migration/__pycache__/ParseMapping.cpython-37.pyc


+ 798 - 0
hyperopt/HyperoptPipelineSelection.py

@@ -0,0 +1,798 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Nov  9 13:27:44 2018
+
+@author: tanja
+@description: Implementation of machine learning
+                pipeline selection and tuning with hyperopt library
+"""
+
+import os
+import sys
+import gc
+import logging
+import pickle
+import time
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
+    space_eval, pyll
+
+from sklearn.model_selection import cross_validate
+from sklearn.metrics import make_scorer
+
+
+class HyperoptPipelineSelection:
+    '''
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    developped by hyperopt creators).
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    '''
+    def __init__(self,
+                 cost_func,
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = 1,
+                 log_path: str = None,
+                 averaging_func: callable = None):
+        '''
+        :param callable cost_func: function to minimize or maximize
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from
+            the beginning.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+
+        :param str log_path: Optional, when not provided logs to stdout.
+
+        :param callable averaging_func: optional,
+            when not provided set to mean. Function
+            to aggregate the cross-validated values of the cost function.
+            The classic choice is the mean;
+            another example is mean() - c*var().
+        '''
+
+        assert(callable(cost_func)),\
+            "Parameter 'cost_func' must be a callable"
+
+        assert(isinstance(greater_is_better, bool)),\
+            "Parameter 'greater_is_better' must be bool type"
+
+        assert(isinstance(trials_path, str)),\
+            "Parameter 'trials_path' must be of string type"
+
+        if averaging_func is not None:
+            assert(callable(averaging_func)),\
+                "Parameter 'averaging_func' must be a callable"
+
+        self._assert_valid_directory(path=trials_path)
+
+        self._configure_logger(log_path)
+
+        self._cost_func = cost_func
+        # is 1 when cost_func is minimized, -1 when cost func is maximized
+        self._score_factor = (not greater_is_better) - greater_is_better
+        self._trials_path = trials_path
+        # is initialized with empty trials object
+        self._trials = Trials()
+        self._backup_trials_freq = backup_trials_freq
+        self._averaging_func = averaging_func or np.mean
+        # keeping track of the current search iteration
+        self._run_number = 0
+        # space and data need to be attached to perform search.
+        self._space_attached = False
+        self._data_attached = False
+
+        # if a trials object already exists at the given path,
+        # it is loaded and the search is continued. Else,
+        # the search is started from the beginning.
+        if os.path.isfile(trials_path):
+            try:
+                with open(trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(len(self._trials.trials)))
+
+            except Exception as e:
+                self._logger.error(("Trials object could not be loaded. "
+                                    "Training starts from the beginning. "
+                                    "Exit with error {}").format(e))
+
+        else:
+            self._logger.info(("No existing trials object was found"
+                               "Initialized an empty trials object."))
+
+        self._best_score = self.best_trial_score
+
+    def _configure_logger(self, log_path: str = None):
+        '''
+        Can be replaced with the existing script later.
+        When log_path is not provided, logs to stdout.
+        '''
+
+        self._logger = logging.getLogger(__name__)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_path is not None:
+            assert(isinstance(log_path, str)),\
+                "Parameter 'log_path' must be of string type"
+            self._assert_valid_directory(log_path)
+
+            handler = logging.FileHandler(log_path)
+        else:
+            handler = logging.StreamHandler(sys.stdout)
+
+        formatter = logging.Formatter(
+                '\n %(asctime)s %(levelname)s %(message)s')
+
+        handler.setFormatter(formatter)
+        self._logger.addHandler(handler)
+        self._logger.setLevel("INFO")
+
+    def _backup_trials(self):
+        '''
+        Pickles (Saves) the trials object.
+        Used in a scheduler.
+        '''
+        with open(self._trials_path, "wb") as f:
+            pickle.dump(self._trials, f)
+
+    def _assert_valid_directory(self, path: str):
+        '''
+        If the directory of a path does not exist yet,
+        creates it.
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must be of str type"
+
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            os.makedirs(dirname, exist_ok=True)
+
+    def attach_space(self, space: pyll.base.Apply = None,
+                     module_path: str = None,
+                     name: str = None):
+        '''
+        :param pyll.base.Apply space: hyperopt space where
+            the search is performed. Optional when a space
+            is loaded from a python module.
+
+        :param str module_path: path to python module
+            where the space is defined. Optional when
+            the space is provided directly.
+
+        :param str name: name of the space loaded from
+            a python module. Optional when the space
+            is provided directly.
+        '''
+        assert((space is not None) or
+               ((module_path is not None) and (name is not None))),\
+            "Either space or (module_path, name) must be provided"
+
+        if space is None:
+            for p_name, p_value in [("module_path", module_path),
+                                    ("name", name)]:
+                assert(isinstance(p_value, str)),\
+                    "Parameter '{}' must be of str type".format(p_name)
+
+            assert(os.path.isfile(module_path)),\
+                "Parameter 'module_path' must be a valid file"
+
+            module, extension = os.path.splitext(os.path.basename(module_path))
+            assert(extension == ".py"),\
+                "Parameter 'space' must be read from a python file"
+
+            # make the module importable by name
+            sys.path.insert(0, os.path.dirname(module_path))
+
+            import importlib
+
+            try:
+                space = getattr(importlib.import_module(module), name)
+            except (ImportError, AttributeError):
+                err = "Invalid space location or name"
+                self._logger.error(err)
+                raise Exception(err)
+
+        assert(isinstance(space, pyll.base.Apply)),\
+            "Parameter 'space' must be of hyperopt space type"
+
+        self._space = space
+        self._logger.info("Attached parameter distribution space")
+        self._space_attached = True
+
+    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
+            -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            e = 'The argument must be a numpy array or a pandas DataFrame'
+            self._logger.critical(e)
+            raise ValueError(e)
+
+    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
+                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    X_val: (pd.DataFrame, np.ndarray) = None,
+                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
+                    cv: (list, int) = None):
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (not all algorithms require targets)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: list of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+
+        X_train = self._convert_to_array(X_train)
+        if y_train is not None:
+            y_train = self._convert_to_array(y_train)
+
+        if X_val is not None:
+            if cv is not None:
+                self._logger.warning(("Both validation set and cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on the validation set!"))
+
+            X_val = self._convert_to_array(X_val)
+
+            train_inds = list(range(len(X_train)))
+            val_inds = list(range(len(X_train),
+                                  len(X_train) + len(X_val)))
+
+            # cost is evaluated with a cross validation function
+            # that accepts an array and a cv object with
+            # indices of the fold splits.
+            # Here we create a trivial cv object
+            # with one validation split.
+            self._cv = [(train_inds, val_inds)]
+            self._X = np.concatenate([X_train, X_val])
+
+            if y_train is not None:
+                if y_val is None:
+                    err = "Argument y_val must be provided"
+                    self._logger.critical(err)
+                    raise ValueError(err)
+                else:
+                    y_val = self._convert_to_array(y_val)
+                    self._y = np.concatenate([y_train, y_val])
+            else:
+                self._y = None
+        else:
+            if cv is None:
+                self._logger.warning(("Neither validation set nor cv object "
+                                      "are set. Validation score will be "
+                                      "calculated on 5 randomly "
+                                      "splitted folds."))
+
+            self._X = X_train
+            self._y = y_train
+            self._cv = cv
+
+        self._logger.info("Attached data")
+        self._data_attached = True
+
+    def _evaluate(self, pipeline: Pipeline) -> dict:
+        '''
+        This method is called in _objective.
+
+        Calculates the cost on the attached data.
+        This function can be overridden when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :output: dictionary with the aggregated
+            cross-validation score and
+            the score variance.
+        '''
+
+        scores = cross_validate(estimator=pipeline,
+                                X=self._X,
+                                y=self._y,
+                                cv=self._cv or 5,
+                                scoring=make_scorer(self._cost_func),
+                                error_score=np.nan)
+
+        return {'value': self._averaging_func(scores['test_score']),
+                'variance': np.var(scores['test_score'])}
+
+    def _objective(self, space_element: dict) -> dict:
+        '''
+        This method is called in search_for_best_pipeline
+        inside the hyperopt fmin method.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it should be multiplied by -1 to obtain loss.
+
+        :param dict space_element: must contain keys
+            name (with the name of the pipeline),
+            pipeline (Pipeline object),
+            params (dict of pipeline params)
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+        '''
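+        # Hedged example of the expected input (the names below are
+        # illustrative and mirror the space defined in __main__ at the
+        # bottom of this file):
+        # space_element = {
+        #     "name": "KBEST_XGBOOST",
+        #     "pipeline": Pipeline([("kbest", SelectKBest()),
+        #                           ("xgb", XGBClassifier())]),
+        #     "params": {"kbest__k": 3, "xgb__n_estimators": 70}}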
+        assert(isinstance(space_element, dict) and
+               set(['name', 'pipeline', 'params']) <= space_element.keys())
+
+        assert(isinstance(space_element['name'], str) and
+               isinstance(space_element['pipeline'], Pipeline) and
+               isinstance(space_element['params'], dict))
+
+        start_time = time.time()
+
+        if not self._data_attached:
+            raise Exception(("Data must be attached in order "
+                             "in order to effectuate the best"
+                             "pipeline search"))
+
+        self._run_number += 1
+
+        pipeline = space_element['pipeline']
+        params = space_element['params']
+        pipeline.set_params(**params)
+
+        self._logger.info(("Run number {0}: "
+                           "Current score is {1}: "
+                           "Training pipeline {2} "
+                           "with parameters: {3}. ").format(
+                             self._run_number,
+                             self._best_score,
+                             space_element['name'],
+                             params))
+
+        try:
+            score_stats = self._evaluate(pipeline)
+            assert(not np.isnan(score_stats["value"])),\
+                "Returned null score"
+
+            if self._run_number % self._backup_trials_freq == 0:
+                self._backup_trials()
+
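+            # Note: self._best_score != self._best_score is True only when
+            # the best score is NaN, i.e. when no successful trial exists yet.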
+            if (self._best_score != self._best_score) or\
+                self._score_factor*score_stats["value"] <\
+                    self._score_factor*self._best_score:
+
+                self._logger.info("Score got better, new best score is: {}"
+                                  .format(score_stats["value"]))
+
+                self._best_score = score_stats['value']
+
+                self._backup_trials()
+
+            end_time = time.time()
+
+            return {'loss': self._score_factor * score_stats["value"],
+                    'status': STATUS_OK,
+                    'score': score_stats["value"],
+                    'score_variance': score_stats["variance"],
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': end_time - start_time}
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            return {'loss': np.nan,
+                    'status': STATUS_FAIL,
+                    'score': np.nan,
+                    'score_variance': np.nan,
+                    'timestamp': datetime.datetime.today(),
+                    'train_time': np.nan}
+
+    def search_for_best_pipeline(self,
+                                 niter: int,
+                                 algo: callable = tpe.suggest):
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param callable algo: currently can only take the values tpe.suggest
+            for a tree-based search or rand.suggest for a random search
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        assert(isinstance(niter, int)),\
+            "Parameter 'niter' must be of int type"
+
+        # right now only two algorithms are provided by hyperopt
+        assert(algo in [tpe.suggest, rand.suggest]),\
+            ("Parameter 'algo' can currently only be tpe or random. "
+             "If other algorithms have been developed "
+             "by hyperopt, please add them to the list.")
+
+        try:
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            best = fmin(fn=self._objective,
+                        space=self._space,
+                        algo=algo,
+                        trials=self._trials,
+                        max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best)))
+
+            self._backup_trials()
+
+        except Exception as e:
+            raise ValueError(("Failed to select best "
+                             "pipeline! Exit with error: {}").format(e))
+
+    @property
+    def best_trial_score(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_score_variance(self) -> float:
+        '''
+        '''
+        if len(self._trials.trials) > 0:
+            return self._trials.best_trial["result"]["score_variance"]
+        else:
+            return np.nan
+
+    @property
+    def best_trial_pipeline(self) -> Pipeline:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if len(self._trials.trials) > 0:
+
+            return space_eval(
+                    self._space,
+                    {k: v[0] for k, v in
+                     self._trials.best_trial['misc']['vals'].items()
+                     if len(v) > 0})["pipeline"]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def _ith_trial_loss(self, i: int) -> float:
+        '''
+        '''
+        if i < len(self._trials.trials):
+            return self._trials.trials[i]['result']['loss']
+        else:
+            return np.nan
+
+    def _ith_trial_element(self, i: int, name: str) -> object:
+        '''
+        '''
+        assert(self._space_attached),\
+            "Space must be attach to be able to retrieve this information."
+
+        if i < len(self._trials.trials):
+            return space_eval(self._space,
+                              {k: v[0] for k, v in
+                               self._trials.trials[i]['misc']['vals']
+                               .items() if len(v) > 0})[name]
+
+    def _ith_trial_pipeline(self, i: int) -> Pipeline:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='pipeline')
+
+    def _ith_trial_name(self, i: int) -> str:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='name')
+
+    def _ith_trial_params(self, i: int) -> dict:
+        '''
+        '''
+        return self._ith_trial_element(i=i, name='params')
+
+    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
+        '''
+        '''
+        if i < len(self._trials.trials):
+            return self._trials.trials[i]["result"]["timestamp"]
+
+    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
+        '''
+        Returns the list of n best pipelines
+        documented in trials
+        '''
+        if len(self._trials.trials) > 0:
+            if losses is None:
+                losses = [self._ith_trial_loss(i)
+                          for i in range(len(self._trials.trials))]
+
+            best_n_indices = [losses.index(l)
+                              for l in sorted(list(set(losses)))[:n]]
+
+            return [self._ith_trial_pipeline(i) for i in best_n_indices]
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
+        '''
+        Returns a dictionary where keys are pipeline names,
+        and values are lists of the best pipelines with this name
+        '''
+        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
+
+        if len(self._trials.trials) > 0:
+
+            best_pipelines_per_type = {}
+            names = set(self._ith_trial_name(i)
+                        for i in range(len(self._trials.trials)))
+
+            for nm in names:
+                # mask trials of other pipeline types with an infinite
+                # loss, so that the indices found in
+                # get_n_best_trial_pipelines still refer to the
+                # correct trials
+                losses = [self._ith_trial_loss(i)
+                          if self._ith_trial_name(i) == nm
+                          else float("inf")
+                          for i in range(len(self._trials.trials))]
+
+                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
+                                                        n=n,
+                                                        losses=losses)
+
+            return best_pipelines_per_type
+
+        else:
+            err = ("Trials object is empty. "
+                   "Best pipeline cannot be returned")
+
+            self._logger.error(err)
+            raise Exception(err)
+
+    def write_trials_documentation(self, path: str = None):
+        '''
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        '''
+        if len(self._trials.trials) > 0:
+            path = path or "hyperopt_trials_documentation.xlsx"
+
+            assert(isinstance(path, str)),\
+                "Parameter 'path' must be of string type"
+
+            self._assert_valid_directory(path)
+
+            names = [self._ith_trial_name(i)
+                     for i in range(len(self._trials.trials))]
+            scores = [self._score_factor*self._ith_trial_loss(i)
+                      for i in range(len(self._trials.trials))]
+            params = [self._ith_trial_params(i)
+                      for i in range(len(self._trials.trials))]
+            timestamps = [self._ith_trial_timestamp(i)
+                          for i in range(len(self._trials.trials))]
+
+        else:
+            names = []
+            scores = []
+            params = []
+            timestamps = []
+
+        pd.DataFrame({"name": names,
+                      "score": scores,
+                      "params": params,
+                      "timestamp": timestamps})\
+          .to_excel(path)
+
+
+if __name__ == '__main__':
+
+    from sklearn.metrics import roc_auc_score, make_scorer
+    from xgboost import XGBClassifier
+    from sklearn.svm import SVC
+    from sklearn.feature_selection import SelectKBest
+    from sklearn.decomposition import PCA
+    from sklearn.datasets import load_iris
+    from pprint import pprint
+
+    data = load_iris()
+    X = pd.DataFrame(data.data)
+    y = pd.Series(data.target)
+    # produce a binary variable
+    y = (y == 2).astype(int)
+    del data
+    gc.collect()
+
+    # SPACE DEFINITION ########################################
+    # (can be moved to a separate python script)
+
+    """
+    A search space must be a list of dictionaries.
+    Each dictionary must have the keys:
+        name (pipeline name or type),
+        pipeline (instance of sklearn.pipeline.Pipeline),
+        params (dictionary of distributions for the parameters of
+                the pipeline that we want to tune)
+
+    Here we have a space that consists of two dictionaries:
+    KBEST_XGBOOST and PCA_SVC
+    """
+    space = []
+
+    pipeline_dist_1 = {}
+    pipeline_dist_1["name"] = "KBEST_XGBOOST"
+
+    """
+    A pipeline consists of steps (tuples).
+    Each step has a name and an algorithm.
+    This pipeline, as a first step performs
+    feature selection with SelectKBest and
+    as a second step evaluates a machine learning algo (xgboost).
+
+    Like all sklearn algorithms, a Pipeline has methods
+    fit, predict, set_params, get_params
+    """
+    pipeline_dist_1["pipeline"] = Pipeline([
+                                     ('kbest', SelectKBest()),
+                                     ('xgb', XGBClassifier())
+                                     ])
+    """
+    Pipeline parameter dictionaries must be of the form:
+    {'kbest__k': 3, 'xgb__n_estimators': 20},
+    each parameter name consists of the step name, __, and parameter name.
+
+    Here, instead of values, the parameter names are followed
+    by hyperopt distributions.
+    Each hyperopt distribution also must have a name,
+    due to hyperopt functionality.
+
+    Here, we set the hyperopt distribution name to the parameter name,
+    but it does not have to be so. Hyperopt distribution names
+    must be different for different elements of the space.
+    """
+
+    pipeline_dist_1["params"] = {
+            'kbest__k': hp.choice('kbest__k', range(1, 5)),
+
+            'xgb__n_estimators':
+            50 + hp.randint('xgb__n_estimators', 50),
+
+            "xgb__learning_rate":
+            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
+            }
+
+    space.append(pipeline_dist_1)
+
+    pipeline_dist_2 = {}
+    pipeline_dist_2["name"] = "PCA_SVC"
+
+    pipeline_dist_2["pipeline"] = Pipeline([
+                                     ('pca', PCA()),
+                                     ('svc', SVC(gamma="scale"))
+                                     ])
+
+    pipeline_dist_2["params"] = {
+            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
+
+            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
+            }
+
+    space.append(pipeline_dist_2)
+
+    space = hp.choice('pipelines', space)
+
+    # TESTING ##########################################################
+
+    trials_path = 'TEST_hyperopt_trials.pkl'
+
+    doc_path = 'TEST_hyperopt_doc.xlsx'
+
+    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
+                                       greater_is_better=True,
+                                       trials_path=trials_path)
+
+    hp_obj.attach_data(X_train=X, y_train=y)
+
+    hp_obj.attach_space(space=space)
+
+    hp_obj.search_for_best_pipeline(niter=10)
+
+    print('\n', '='*20, 'TESTING', '='*20)
+
+    print('\n', 'Best score:', hp_obj.best_trial_score)
+
+    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
+
+    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
+
+    print('\n', 'Best 3 pipelines: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
+
+    print('\n', 'Best pipeline per type: \n')
+    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
+
+    hp_obj.write_trials_documentation(path=doc_path)
+
+    # os.remove(doc_path)
+    # os.remove(trials_path)

+ 130 - 0
import_process_instances/CleanProcessTable.py

@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 08:55:56 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.db_migration.MigrationCleaning import MigrationCleaning
+
+
+class CleanTable(MigrationCleaning):
+    '''
+    '''
+
+    def __init__(self, mapping_path: str,
+                 inconsist_report_table: str,
+                 filter_index_columns: (str, list),
+                 sort_columns: list = None,
+                 index_columns: list = None,
+                 log_name: str = "CleanProcessTable"):
+        '''
+        '''
+        super().__init__(
+                mapping_path=mapping_path,
+                schema_paths=[os.path.join(".", "mongo_schema",
+                                           "schema_process_instances.json"),
+                              os.path.join(".", "mongo_schema",
+                                           "schema_wheelsets.json"),
+                              os.path.join(".", "mongo_schema",
+                                           "schema_components.json")],
+                inconsist_report_table=inconsist_report_table,
+                filter_index_columns=filter_index_columns,
+                log_name=log_name)
+
+        self._tablename = os.path.basename(self._mapping_path)\
+                            .split("_mapping")[0]
+
+        self._sort_columns = sort_columns
+        self._index_columns = index_columns
+
+        from libraries.db_handlers.SQLHandler import SQLHandler
+
+        self._sql_db = SQLHandler()
+
+    def read_data(self, wheelsets):
+        '''
+        '''
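+        # str(tuple(...)) of a single element renders as ('x',), which is
+        # not valid SQL; hence the separate branch for exactly one wheel-set.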
+        if len(wheelsets) > 1:
+            query = "SELECT * FROM {0} WHERE radsatznummer in {1}"\
+                    .format(self._tablename, tuple(wheelsets))
+        else:
+            query = "SELECT * FROM {0} WHERE radsatznummer = '{1}'"\
+                    .format(self._tablename, wheelsets[0])
+
+        return self._sql_db.read_sql_to_dataframe(query)
+
+    def drop_duplicated_entries(self, data: pd.DataFrame,
+                                columns_to_ignore: list = None
+                                ) -> pd.DataFrame:
+        '''
+        '''
+        if columns_to_ignore is None:
+            columns_to_ignore = ["ende_der_bearbeitung"]
+
+        self.error_column_abscence(columns=columns_to_ignore, data=data)
+
+        defining_columns = [c for c in data.columns
+                            if c not in columns_to_ignore]
+
+        return data.drop_duplicates(subset=defining_columns)\
+                   .reset_index(drop=True)
+
+    @property
+    def field_mapping(self):
+        '''
+        '''
+        return self._mapping_parser.get_field_mapping()
+
+
+class CleanProcessTable(CleanTable):
+    '''
+    '''
+    def __init__(self, mapping_path: str,
+                 inconsist_report_table: str = None,
+                 filter_index_columns=["radsatznummer"],
+                 sort_columns: list = None,
+                 index_columns: list = None,
+                 log_name: str = "CleanProcessTable"):
+        '''
+        '''
+        super().__init__(
+                mapping_path=mapping_path,
+                sort_columns=sort_columns,
+                index_columns=index_columns,
+                inconsist_report_table=inconsist_report_table,
+                filter_index_columns=filter_index_columns,
+                log_name=log_name)
+
+    def _get_next_station_start_time(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        self.error_column_abscence(columns=["radsatznummer", "positionsnummer",
+                                            "begin_der_bearbeitung"],
+                                   data=data)
+
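+        # Approach: after sorting, each row looks at the next row's
+        # begin_der_bearbeitung within the same wheel-set. Where the next
+        # row still belongs to the same station the value is blanked out and
+        # back-filled, so every row ends up with the start time of the next
+        # station. The sentinel "temp" marks the last row of a wheel-set,
+        # which has no next station and becomes NaT.
+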
+        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung"],
+                         inplace=True)
+
+        start_time_next_station =\
+            data.groupby("radsatznummer")["begin_der_bearbeitung"].shift(-1)\
+                .fillna("temp")
+
+        station_change = (data.groupby("radsatznummer")["positionsnummer"]
+                              .shift(-1) != data["positionsnummer"])
+
+        start_time_next_station.loc[~station_change] = np.nan
+
+        start_time_next_station.fillna(method="bfill", inplace=True)
+
+        start_time_next_station.loc[start_time_next_station == "temp"] = np.nan
+
+        return pd.to_datetime(start_time_next_station)

+ 87 - 0
import_process_instances/CleanRs0.py

@@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:14:46 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanTable
+
+
+class CleanRs0(CleanTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs0_mapping.json"),
+                inconsist_report_table="inconsist_rs0",
+                filter_index_columns=["radsatznummer"],
+                sort_columns=["radsatznummer", "eingabe_datum"],
+                index_columns=["radsatznummer", "eingabe_datum"],
+                log_name="CleanRs0:")
+
+    def restrict_to_process_data(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        process_columns = ["radsatznummer", "aufarbeitungstyp", "ihs",
+                           "befundung_code_1", "befundung_code_2",
+                           "befundung_code_3"]
+
+        self.error_column_abscence(columns=process_columns,
+                                   data=data)
+
+        return data[process_columns]
+
+    def add_ist_schrott(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        mongo_name = "final_state.ist_schrott"
+
+        self.error_column_abscence(columns=["aufarbeitungstyp"],
+                                   data=data)
+
+        data[mongo_name] = (data["aufarbeitungstyp"] == 2)
+
+        return data
+
+    def restrict_to_meta_data(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        meta_columns = [c for c in data.columns if c not in
+                        ["aufarbeitungstyp", "ihs",
+                         "befundung_code_1", "befundung_code_2",
+                         "befundung_code_3"]]
+
+        self.error_column_abscence(columns=meta_columns,
+                                   data=data)
+
+        return data[meta_columns]
+
+    def filter_invalid_metacolumns(self, data: pd.DataFrame,
+                                   metacolumns: list = None) -> pd.DataFrame:
+        '''
+        '''
+        if metacolumns is None:
+            metacolumns = ["wellentype", "Lagerbauart", "tauschgruppe"]
+
+        for column in metacolumns:
+
+            invalid_mask = data[column].isnull()
+
+            reason = "Missing {}".format(column)
+
+            data = self._filter_invalid_data(invalid_mask=invalid_mask,
+                                             reason=reason,
+                                             data=data)
+
+        return data

+ 170 - 0
import_process_instances/CleanRs1.py

@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 09:59:54 2019
+
+@author: tanya
+"""
+
+import gc
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
+
+
+class CleanRs1(CleanProcessTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs1_mapping.json"),
+                inconsist_report_table="inconsist_rs1",
+                sort_columns=["radsatznummer", "begin_der_bearbeitung"],
+                index_columns=["radsatznummer"],
+                log_name="CleanRs1")
+
+    def clean_ende_der_bearbeitung(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        We filter out all the data that has a missing begin_der_bearbeitung
+         (these cases should be very rare);
+         if ende_der_bearbeitung is missing, we fill it with the
+         begin_der_bearbeitung of the next station.
+        '''
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "ende_der_bearbeitung",
+                                            "begin_der_bearbeitung",
+                                            "status"],
+                                   data=data)
+
+        for time_column in ["ende_der_bearbeitung", "begin_der_bearbeitung"]:
+            data[time_column] = pd.to_datetime(data[time_column])
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        start_time_next_station = self._get_next_station_start_time(data=data)
+
+        data["ende_der_bearbeitung"].fillna(start_time_next_station,
+                                            inplace=True)
+
+        del start_time_next_station
+        gc.collect()
+
+        return data
+
+    def filter_invalid_ende_der_bearbeitung(self, data: pd.DataFrame
+                                            ) -> pd.DataFrame:
+        '''
+        '''
+        is_invalid = (
+                (data["ende_der_bearbeitung"].isnull() &
+                 (data["status"] != "Aktiv")) |
+                (data["begin_der_bearbeitung"].isnull()) |
+                (data["ende_der_bearbeitung"] < data["begin_der_bearbeitung"]))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid ende der bearbeitung")
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        return data
+
+    def filter_invalid_status(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        We filter out the cases when work at a station was finished
+         with the status "Aktiv" or "Abbruch". An exception is the very last
+         station per wheel-set, because it can be a non-finished process.
+        '''
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "positionsnummer",
+                                            "status"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        is_station_change = (data["positionsnummer"] !=
+                             data["positionsnummer"].shift(-1))
+
+        is_last_station = (data["radsatznummer"] !=
+                           data["radsatznummer"].shift(-1))
+
+        has_invalid_status = (
+                is_station_change &
+                (~is_last_station) &
+                (data["status"].isin(["Aktiv", "Abbruch"])))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=has_invalid_status,
+                    reason="invalid status")
+
+        return data
+
+    def add_finished(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        We add a variable indicating if the process is finished or not
+        '''
+        mongo_name = "final_state.finished"
+
+        self.error_column_abscence(columns=["radsatznummer", "status"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        not_finished = ["Aktiv", "Abbruch"]
+
+        last_status_map = data.groupby("radsatznummer")["status"].last()
+
+        data[mongo_name] = ~data["radsatznummer"].map(last_status_map)\
+                                                 .isin(not_finished)
+
+        return data
+
+    def add_stage(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        In the configuration we store the process stage definitions in the
+         form of a graph.
+        '''
+        from libraries.configuration import default as cfg
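+        # Hedged sketch of the assumed configuration: cfg.process_stages is
+        # a directed graph (networkx-style) whose nodes are stage names and
+        # whose node attribute "stations" holds the station numbers of that
+        # stage, e.g.
+        #   cfg.process_stages.nodes()[stage]["stations"] -> [110, 115]
+        #   cfg.process_stages.successors(stage)          -> following stages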
+
+        mongo_name = "process.stage"
+
+        self.error_column_abscence(columns=["radsatznummer", "positionsname"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        def cumsum_str(x):
+            return x.cumsum()
+
+        def break_cum_string_to_list(x):
+            return [int(st) for st in x.split("|")[:-1]]
+
+        previous_stations = data\
+            .assign(positionsnummer=data.positionsnummer.astype(str).add("|"))\
+            .groupby("radsatznummer")["positionsnummer"]\
+            .apply(cumsum_str)\
+            .apply(break_cum_string_to_list)
+
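+        # previous_stations now holds, for every row, the cumulative list of
+        # station numbers this wheel-set has visited so far. Below, a row is
+        # assigned to a stage if it has reached at least one of that stage's
+        # stations but none of the stations of the directly following stages.
+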
+        for stage in cfg.process_stages.nodes():
+            this_stage_stations = cfg.process_stages.nodes()[stage]["stations"]
+            next_stage_stations = [item for next_stage
+                                   in cfg.process_stages.successors(stage)
+                                   for item in cfg.process_stages.nodes()
+                                   [next_stage]["stations"]]
+
+            def check_stage(x):
+                return (len(set(this_stage_stations) & set(x)) != 0) and \
+                       (len(set(next_stage_stations) & set(x)) == 0)
+
+            data.loc[previous_stations.apply(check_stage), mongo_name] = stage
+
+        return data

+ 82 - 0
import_process_instances/CleanRs2.py

@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:06:48 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
+
+
+class CleanRs2(CleanProcessTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs2_mapping.json"),
+                inconsist_report_table="inconsist_rs2",
+                sort_columns=["radsatznummer", "ende_der_bearbeitung"],
+                index_columns=["radsatznummer", "positionsnummer"],
+                log_name="CleanRs2")
+
+    def filter_invalid_ende_der_bearbeitung(self, data: pd.DataFrame
+                                            ) -> pd.DataFrame:
+        '''
+        We filter out all the rows that have a missing ende_der_bearbeitung;
+         it means that the activities were planned, but not executed.
+        '''
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "ende_der_bearbeitung"],
+                                   data=data)
+
+        is_invalid = (data["ende_der_bearbeitung"].isnull())
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid ende der bearbeitung")
+
+        data["ende_der_bearbeitung"] =\
+            pd.to_datetime(data["ende_der_bearbeitung"])
+
+        return data
+
+    def filter_invalid_taetigkeitsname(self, data: pd.DataFrame
+                                       ) -> pd.DataFrame:
+        '''
+        In the configuration we store a list of activities whose execution
+         means that the wheel-set is scrap.
+         After the execution of these activities the process history
+         should end.
+        '''
+        from libraries.configuration import default as cfg
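+        # Assumption: cfg.schrott_taetigkeiten is a list of activity names
+        # that scrap the wheel-set; the concrete values are defined in the
+        # configuration, not here.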
+
+        self.error_column_abscence(columns=["radsatznummer",
+                                            "taetigkeitsname"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        is_last_station = (
+            data["radsatznummer"] !=
+            data["radsatznummer"].shift(-1))
+
+        is_invalid = (
+                ~is_last_station &
+                (data["taetigkeitsname"].isin(cfg.schrott_taetigkeiten)))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid taetigkeit")
+
+        return data
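filter_invalid_taetigkeitsname (and the analogous filter_invalid_schadcode in CleanRs70 below) flags scrap activities that are not the last recorded row of their wheel-set, using a shift(-1) comparison on the sorted frame. A small sketch of that pattern, with a made-up scrap list standing in for cfg.schrott_taetigkeiten:

import pandas as pd

# hypothetical stand-in for cfg.schrott_taetigkeiten
schrott_taetigkeiten = ["verschrotten"]

df = pd.DataFrame({"radsatznummer": ["A", "A", "B"],
                   "taetigkeitsname": ["verschrotten", "pruefen", "pruefen"]})

# after sorting, a row is the last one of its wheel-set when the next row
# belongs to a different radsatznummer
is_last_row = df["radsatznummer"] != df["radsatznummer"].shift(-1)

is_invalid = ~is_last_row & df["taetigkeitsname"].isin(schrott_taetigkeiten)

print(is_invalid.tolist())   # [True, False, False]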

+ 58 - 0
import_process_instances/CleanRs70.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:11:55 2019
+
+@author: tanya
+"""
+import pandas as pd
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
+
+
+class CleanRs70(CleanProcessTable):
+    '''
+    '''
+    def __init__(self):
+        '''
+        '''
+        super().__init__(
+                mapping_path=os.path.join(".", "migration_mappings",
+                                          "rs70_mapping.json"),
+                inconsist_report_table="inconsist_rs70",
+                sort_columns=["radsatznummer", "eingabe_datum"],
+                index_columns=["radsatznummer", "eingabe_datum"],
+                log_name="CleanRs70")
+
+    def filter_invalid_schadcode(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        The configuration stores a list of schadcodes whose assignment
+         means that the product is scrap. No further schadcodes should be
+         assigned after such a schadcode.
+        '''
+        from libraries.configuration import default as cfg
+
+        self.error_column_abscence(columns=["radsatznummer", "schadcode"],
+                                   data=data)
+
+        data.sort_values(by=self._sort_columns, inplace=True)
+
+        is_last_schadcode = (data["radsatznummer"] !=
+                             data["radsatznummer"].shift(-1))
+
+        is_invalid = (~is_last_schadcode &
+                      data["schadcode"].isin(cfg.schrott_schadcodes))
+
+        data = self._filter_invalid_data(
+                    data=data,
+                    invalid_mask=is_invalid,
+                    reason="invalid schadcode")
+
+        # XXX temporary here
+        # data["eingabe_datum"] = pd.to_datetime(data["eingabe_datum"])
+
+        return data

+ 149 - 0
import_process_instances/MergeProcessTables.py

@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 30 10:16:23 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+import os
+import sys
+
+sys.path.append(os.getcwd())
+
+from libraries.import_process_instances.CleanRs1 import CleanRs1
+
+
+class MergeProcessTables:
+    '''
+    '''
+    def merge_rs2(self, data: pd.DataFrame, rs2: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Difficulty: rows that correspond to one radsatznummer and one station
+         in rs1 and rs2 are in a many-to-many relation, and the
+         ende_der_bearbeitung of these rows often does not match
+         between the two tables.
+
+        Rules:
+            A) An activity from rs2 is matched to an rs1 entry if its
+            ende_der_bearbeitung is >= begin_der_bearbeitung and
+            <= ende_der_bearbeitung of that entry.
+
+            B) If an activity (row in rs2) has an ende_der_bearbeitung later
+            than the ende_der_bearbeitung of all rs1 entries, we check
+            whether it ended before begin_der_bearbeitung on the next
+            station. If so, we assign the activity to the latest rs1 entry
+            for this station.
+
+        The same logic applies when merging the table rs70.
+        '''
+        data = data.copy(deep=True)
+        rs2 = rs2.copy(deep=True)
+
+        station_change = (data["positionsnummer"] !=
+                          data["positionsnummer"].shift(-1))
+
+        data["order"] = data.index
+
+        common_columns = ["radsatznummer", "positionsnummer",
+                          "positionsname"]
+
+        data = pd.merge(data, rs2, how="left", on=common_columns)
+
+        start_time_next_station =\
+            CleanRs1()._get_next_station_start_time(data)\
+                      .fillna(data["ende_der_bearbeitung_x"])
+
+        start_matches = (data["ende_der_bearbeitung_y"] >=
+                         data["begin_der_bearbeitung"])
+
+        end_matches = ((data["ende_der_bearbeitung_y"] <=
+                       data["ende_der_bearbeitung_x"]) |
+                       data["ende_der_bearbeitung_y"].isnull())
+
+        end_almost_matches = ((data["ende_der_bearbeitung_y"] <=
+                               start_time_next_station) &
+                              station_change
+                              )
+
+        time_matches = (start_matches & end_matches) |\
+                       (start_matches & (~end_matches) & end_almost_matches)
+
+        rs2_columns = [c for c in rs2.columns
+                       if (c not in common_columns) and (c in data.columns)] +\
+                      [c + "_y" for c in rs2.columns
+                       if c + "_y" in data.columns]
+
+        for c in rs2_columns:
+            data.loc[~time_matches, c] = np.nan
+
+        data.sort_values(by=["radsatznummer",
+                             "begin_der_bearbeitung",
+                             "ende_der_bearbeitung_y"],
+                         inplace=True)
+
+        # we keep all the rows that were in rs1 even if there are no
+        # corresponding activities from rs2
+        keep_row = time_matches | (~data["order"].duplicated(keep="first"))
+
+        data = data.loc[keep_row].copy(deep=True).reset_index(drop=True)
+
+        data["ende_der_bearbeitung"] = data[["ende_der_bearbeitung_x",
+                                             "ende_der_bearbeitung_y"]]\
+            .max(axis=1)
+
+        data.drop(["ende_der_bearbeitung_x", "ende_der_bearbeitung_y",
+                   "order"], axis=1, inplace=True)
+
+        return data
+
+    def merge_rs70(self, data: pd.DataFrame, rs70: pd.DataFrame
+                   ) -> pd.DataFrame:
+        '''
+        '''
+        data["order"] = data.index
+
+        data = pd.merge(data, rs70, how="left", on="radsatznummer")
+
+        time_matches = (
+                (data["eingabe_datum"] >= data["begin_der_bearbeitung"]) &
+                (data["eingabe_datum"] <= data["ende_der_bearbeitung"]))
+
+        rs70_columns = [c for c in rs70.columns
+                        if (c != "radsatznummer") and (c in data.columns)] +\
+                       [c + "_y" for c in rs70.columns
+                        if c + "_y" in data.columns]
+
+        for c in rs70_columns:
+            data.loc[~time_matches, c] = np.nan
+
+        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung",
+                             "eingabe_datum"], inplace=True)
+
+        keep_row = time_matches | (~data["order"].duplicated(keep="first"))
+
+        data = data.loc[keep_row]\
+                   .drop("order", axis=1)\
+                   .reset_index(drop=True)
+
+        return data
+
+    def merge_rs0(self, data: pd.DataFrame, rs0: pd.DataFrame) -> pd.DataFrame:
+        '''
+        '''
+        data = pd.merge(data, rs0, how="left", on="radsatznummer")
+
+        no_befundung_mask = (data["positionsnummer"] != 110)
+
+        for column in ["befundung_code_1",
+                       "befundung_code_2",
+                       "befundung_code_3"]:
+
+            data.loc[no_befundung_mask, column] = np.nan
+
+        return data
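The interval matching in merge_rs2 above follows rules A and B from its docstring: an rs2 activity is kept on an rs1 row only if it ends inside that row's [begin_der_bearbeitung, ende_der_bearbeitung] window, with rule B relaxing the upper bound at a station change. A toy illustration of rule A alone, on made-up timestamps (the station-change relaxation and the CleanRs1 helper are left out):

import pandas as pd

rs1 = pd.DataFrame({"radsatznummer": ["A"],
                    "positionsnummer": [100],
                    "begin_der_bearbeitung": pd.to_datetime(["2019-01-01 08:00"]),
                    "ende_der_bearbeitung": pd.to_datetime(["2019-01-01 12:00"])})

rs2 = pd.DataFrame({"radsatznummer": ["A", "A"],
                    "positionsnummer": [100, 100],
                    "taetigkeitsname": ["pruefen", "nacharbeit"],
                    "ende_der_bearbeitung": pd.to_datetime(["2019-01-01 09:30",
                                                            "2019-01-01 14:00"])})

# ende_der_bearbeitung exists in both tables and gets the _x/_y suffixes
merged = pd.merge(rs1, rs2, how="left", on=["radsatznummer", "positionsnummer"])

rule_a = ((merged["ende_der_bearbeitung_y"] >= merged["begin_der_bearbeitung"]) &
          (merged["ende_der_bearbeitung_y"] <= merged["ende_der_bearbeitung_x"]))

print(rule_a.tolist())   # [True, False]: only the 09:30 activity falls in the window

merge_rs70 applies the same idea, requiring eingabe_datum to fall inside the window.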

BIN
import_process_instances/__pycache__/CleanProcessTable.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs0.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs1.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs2.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs70.cpython-37.pyc


BIN
import_process_instances/__pycache__/MergeProcessTables.cpython-37.pyc


BIN
import_process_instances/__pycache__/parallelized_import.cpython-37.pyc


+ 74 - 0
import_process_instances/parallelized_import.py

@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct  1 11:15:03 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+from typing import Callable
+sys.path.append(os.getcwd())
+
+
+def get_all_wheelsets():
+    '''
+    return: list of distinct wheelset numbers in the process
+    '''
+    from libraries.db_handlers.SQLHandler import SQLHandler
+
+    sql_db = SQLHandler()
+
+    query = "SELECT DISTINCT radsatznummer FROM rs1"
+
+    return sql_db.read_sql_to_dataframe(query)["radsatznummer"].tolist()
+
+
+def parallelized_import(all_instances: list,
+                        mongo_schema_path: str,
+                        import_chunk: Callable,
+                        log_name: str = None):
+
+    from concurrent.futures import ThreadPoolExecutor
+
+    from libraries.db_handlers.MongodbHandler import MongodbHandler
+
+    from libraries.log import Log
+
+    import argparse
+
+    argparser = argparse.ArgumentParser(
+            description='Import process instances collection')
+    argparser.add_argument('--chunksize', type=int, default=100,
+                           help="Number of wheelsets processed at a time")
+    argparser.add_argument('--max_workers', type=int, default=10,
+                           help="Number of workers in ThreadPoolExecutor")
+    args = argparser.parse_args()
+
+    log = Log(log_name)
+
+    log.info("Start application")
+    log.info("Processing {0} wheelsets at a time parallelized with {1} workers"
+             .format(args.chunksize, args.max_workers))
+
+    # str.strip removes a set of characters, not a prefix,
+    # so drop the "schema_" prefix explicitly instead
+    collection_name = os.path.basename(mongo_schema_path).split(".")[0]
+    if collection_name.startswith("schema_"):
+        collection_name = collection_name[len("schema_"):]
+
+    mongodb = MongodbHandler()
+
+    mongodb.create_collection_and_set_schema(
+            collection_name=collection_name,
+            schema_path=mongo_schema_path)
+
+    try:
+        n_chunks = len(all_instances)//args.chunksize + 1
+
+        with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+            for i in range(n_chunks):
+                executor.submit(import_chunk,
+                                all_instances[i*args.chunksize:(i+1)*args.chunksize], i)
+
+    except Exception as e:
+        err = ("Failed to import {0} in mongodb. "
+               "Exit with error: {1}".format(collection_name, e))
+        log.error(err)
+        raise Exception(e)
+
+    log.info("Finished application")

+ 58 - 0
log.py

@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+"""
+@author: jürgen.pannosch, tanja.zolotareva
+"""
+
+import sys
+import os
+import logging
+
+
+class Log:
+    def __init__(self, name: str = None,
+                 log_file: str = None,
+                 log_level: str = "INFO",
+                 print_to_stdout: bool = True):
+        """Sets the log level and the path where the log file is stored
+
+        :param log_file: Path to the log file.
+        :param log_level: Log level."""
+
+        if name is None:
+            name = ''
+
+        self._logger = logging.getLogger(name)
+
+        if (self._logger.hasHandlers()):
+            self._logger.handlers.clear()
+
+        if log_file is None:
+            log_file = os.path.join(".", "all.log")
+
+        assert(isinstance(log_file, str)),\
+            "Parameter 'log_path' must be of string type"
+
+        formatter = logging.Formatter(
+                '\n %(name)s %(asctime)s %(levelname)s %(message)s')
+
+        # dirname is empty for a bare file name; only create real directories
+        if os.path.dirname(log_file):
+            os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(formatter)
+        self._logger.addHandler(file_handler)
+
+        if print_to_stdout:
+            stream_handler = logging.StreamHandler(sys.stdout)
+            stream_handler.setFormatter(formatter)
+            self._logger.addHandler(stream_handler)
+
+        self._logger.setLevel(log_level)
+
+    def info(self, message: str):
+        self._logger.info(message)
+
+    def warning(self, message: str):
+        self._logger.warning(message)
+
+    def error(self, message: str):
+        self._logger.error(message)
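Log is a thin wrapper around the standard logging module: it clears any handlers already attached to the named logger, always writes to a file (./all.log by default) and optionally mirrors messages to stdout. A minimal usage sketch, assuming the module is importable as libraries.log as in the other files; the log file path is made up:

import os

from libraries.log import Log   # assumed import path

log = Log(name="demo",
          log_file=os.path.join(".", "logs", "demo.log"),
          log_level="INFO")

log.info("Start application")
log.warning("Something looks odd")
log.error("Something went wrong")
# messages go to ./logs/demo.log and, with print_to_stdout=True (default), to stdout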

+ 73 - 0
utils/ClassLogging.py

@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 27 14:20:58 2019
+
+@author: tanya
+"""
+
+import os
+import sys
+import pandas as pd
+sys.path.append(os.getcwd())
+
+
+class ClassLogging:
+    '''
+    '''
+    def __init__(self, log_name: str = None):
+        '''
+        '''
+        from libraries.log import Log
+
+        self._log = Log(log_name)
+
+    def log_and_raise(self, message):
+        '''
+        '''
+        self._log.error(message)
+
+        raise Exception(message)
+
+    def log_and_warn(self, message):
+        '''
+        '''
+        self._log.warning(message)
+
+    def check_is_file(self, path):
+        '''
+        '''
+        if not os.path.isfile(path):
+            err = "File {} not found".format(path)
+            self._log.error(err)
+            raise FileNotFoundError(err)
+
+    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
+                               error_or_warning: str):
+        '''
+        '''
+        if isinstance(columns, str):
+            columns = [columns]
+
+        for column in columns:
+
+            if column not in data.columns:
+                err = ("Column {} is missing from the data".format(column))
+                getattr(self._log, error_or_warning)(err)
+
+                if error_or_warning == "error":
+                    raise Exception(err)
+
+    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame):
+        '''
+        '''
+        return self._check_column_abscence(columns=columns,
+                                           data=data,
+                                           error_or_warning="error")
+
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
+        '''
+        '''
+        return self._check_column_abscence(columns=columns,
+                                           data=data,
+                                           error_or_warning="warning")
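ClassLogging is the base class the cleaning classes inherit from: error_column_abscence logs and raises when a required column is missing, while warn_column_abscence only logs. A short sketch against a throw-away DataFrame; the import path is an assumption based on the repository layout:

import pandas as pd

from libraries.utils.ClassLogging import ClassLogging   # assumed import path

checker = ClassLogging(log_name="demo")

df = pd.DataFrame({"radsatznummer": ["A", "B"]})

checker.warn_column_abscence(columns="positionsnummer", data=df)    # logs a warning
checker.error_column_abscence(columns=["radsatznummer"], data=df)   # present, passes
# error_column_abscence(columns="positionsnummer", data=df) would log and raise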

+ 62 - 0
utils/CleaningUtils.py

@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Sep 27 16:20:03 2019
+
+@author: tanya
+"""
+
+import pandas as pd
+import numpy as np
+
+
+class CleaningUtils:
+    '''
+    '''
+    @staticmethod
+    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
+        '''
+        '''
+        # list("%d%m%Y") would split a format string into characters,
+        # so wrap a single format in a list instead
+        if isinstance(formats, str):
+            formats = [formats]
+
+        # keep the original index so that fillna below aligns correctly
+        converted = pd.Series([pd.to_datetime(np.nan)]*len(series),
+                              index=series.index)
+
+        for formt in formats:
+            if formt == "%d%m%Y":
+                missing_leading_zero = (series.astype(str).str.len() == 7)
+
+                series = series.astype(str)
+
+                series.loc[missing_leading_zero] = "0" +\
+                    series.loc[missing_leading_zero]
+
+            converted_this_format = pd.to_datetime(series,
+                                                   format=formt,
+                                                   errors="coerce")
+
+            converted.fillna(converted_this_format, inplace=True)
+
+        return converted
+
+    def standarize_writing(self, s: str):
+        '''
+        '''
+        import re
+
+        german_character_mapping = {"ß": "ss",
+                                    "ü": "ue",
+                                    "Ü": "Ue",
+                                    "ä": "ae",
+                                    "Ä": "Ae",
+                                    "ö": "oe",
+                                    "Ö": "Oe"}
+
+        s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
+        for char, correct_char in german_character_mapping.items():
+            s = s.replace(char, correct_char)
+
+        s = s.lower()
+
+        s = re.sub('[^0-9a-zA-Z]+', '_', s)
+
+        return s
+
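convert_dates tries each format in turn and keeps the first successful parse per value, with a special case that restores the leading zero of 7-character %d%m%Y dates; standarize_writing transliterates German umlauts, lowercases and collapses non-alphanumeric runs into underscores. A small sketch on made-up values; the import path is assumed:

import pandas as pd

from libraries.utils.CleaningUtils import CleaningUtils   # assumed import path

# mixed formats; the last value is a %d%m%Y date missing its leading zero
dates = pd.Series(["2019-09-27", "27092019", "1102019"])

parsed = CleaningUtils.convert_dates(dates, formats=["%Y-%m-%d", "%d%m%Y"])
print(parsed.dt.strftime("%Y-%m-%d").tolist())
# ['2019-09-27', '2019-09-27', '2019-10-01']

print(CleaningUtils().standarize_writing("Änderung der Oberfläche"))
# 'aenderung_der_oberflaeche'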

BIN
utils/__pycache__/ClassLogging.cpython-37.pyc


BIN
utils/__pycache__/CleaningUtils.cpython-37.pyc