tsteuer 5 years ago
parent
commit
81b0368818
43 changed files with 0 additions and 5185 deletions
  1. cdplib/__init__.py  + 0 - 1
  2. cdplib/__pycache__/__init__.cpython-37.pyc  BIN
  3. cdplib/feature_engineering/StatisticalFeatures.py  + 0 - 269
  4. cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py  + 0 - 77
  5. cdplib/feature_engineering/StatisticalFeaturesOverTime.py  + 0 - 53
  6. cdplib/feature_engineering/Test.py  + 0 - 15
  7. cdplib/feature_engineering/__init__.py  + 0 - 2
  8. cdplib/hyperopt/HyperoptPipelineSelection.py  + 0 - 798
  9. db_handlers/MongodbHandler.py  + 0 - 211
  10. db_handlers/SQLHandler.py  + 0 - 595
  11. db_handlers/__pycache__/MongodbHandler.cpython-37.pyc  BIN
  12. db_handlers/__pycache__/SQLHandler.cpython-37.pyc  BIN
  13. db_handlers/__pycache__/SQLOperations.cpython-37.pyc  BIN
  14. db_migration/DataFrameToCollection.py  + 0 - 352
  15. db_migration/MigrationCleaning.py  + 0 - 520
  16. db_migration/ParseDbSchema.py  + 0 - 62
  17. db_migration/ParseJsonSchema.py  + 0 - 332
  18. db_migration/ParseMapping.py  + 0 - 157
  19. db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc  BIN
  20. db_migration/__pycache__/MigrationCleaning.cpython-37.pyc  BIN
  21. db_migration/__pycache__/ParseDbSchema.cpython-37.pyc  BIN
  22. db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc  BIN
  23. db_migration/__pycache__/ParseMapping.cpython-37.pyc  BIN
  24. hyperopt/HyperoptPipelineSelection.py  + 0 - 798
  25. import_process_instances/CleanProcessTable.py  + 0 - 130
  26. import_process_instances/CleanRs0.py  + 0 - 87
  27. import_process_instances/CleanRs1.py  + 0 - 170
  28. import_process_instances/CleanRs2.py  + 0 - 82
  29. import_process_instances/CleanRs70.py  + 0 - 58
  30. import_process_instances/MergeProcessTables.py  + 0 - 149
  31. import_process_instances/__pycache__/CleanProcessTable.cpython-37.pyc  BIN
  32. import_process_instances/__pycache__/CleanRs0.cpython-37.pyc  BIN
  33. import_process_instances/__pycache__/CleanRs1.cpython-37.pyc  BIN
  34. import_process_instances/__pycache__/CleanRs2.cpython-37.pyc  BIN
  35. import_process_instances/__pycache__/CleanRs70.cpython-37.pyc  BIN
  36. import_process_instances/__pycache__/MergeProcessTables.cpython-37.pyc  BIN
  37. import_process_instances/__pycache__/parallelized_import.cpython-37.pyc  BIN
  38. import_process_instances/parallelized_import.py  + 0 - 74
  39. log.py  + 0 - 58
  40. utils/ClassLogging.py  + 0 - 73
  41. utils/CleaningUtils.py  + 0 - 62
  42. utils/__pycache__/ClassLogging.cpython-37.pyc  BIN
  43. utils/__pycache__/CleaningUtils.cpython-37.pyc  BIN

+ 0 - 1
cdplib/__init__.py

@@ -1 +0,0 @@
-from .feature_engineering import Test

BIN
cdplib/__pycache__/__init__.cpython-37.pyc


+ 0 - 269
cdplib/feature_engineering/StatisticalFeatures.py

@@ -1,269 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-""" 
-Created on Tue Oct 16 16:08:47 2018
-
-@author: tanya
-"""
-import types
-import logging
-import pandas as pd
-
-from collections import defaultdict
-from functools import reduce
-
-from libraries.logging.logging_utils import configure_logging
-from libraries.exception_handling import InputChecks
-          
-class StatisticalFeatures:
-    '''
-    Groups data by index columns and returns aggregated statistics for given columns
-    
-    :param list of tuples or dict index_cols: 
-        is either a list of tuples of form: [(colname_1, [aggfunc_1, aggfunc_2]), 
-                                             (colname_2, aggfunc_3)]
-        or a dictionary of form: {colname_1 : [aggfunc_1, aggfunc_2], colname_2 : aggfunc_3}
-        where colname_i is column to aggregate and aggfunc_i are either 
-        function variables or strings accepted by pandas for built-in function names.
-        REMARK: using strings for built-in functions will speed up the calculations by a factor >= 20.
-        WARNING: if multiple aggfuncs with the same name are given for a given column (like 'sum' and np.sum),
-        then only the first one is kept.
-        WARNING: nan values are ignored by numpy and pandas built-in aggregation functions.
-        
-    '''
-    def __init__(self, data, index_cols, path_to_log = None):
-        '''
-        '''
-        configure_logging(path_to_log)
-            
-        self.logger = logging.getLogger(__name__)
-        
-        self.checks = InputChecks(logger = self.logger)
-        
-        self.data = data
-        
-        self.checks.assert_correct_type({'data': [pd.DataFrame]})
-            
-        self.index_cols = index_cols
-        
-        # make warning about missing values in index columns
-        for col in self.index_cols:
-            if data[col].isnull().any():
-                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
-        
-    def get_kpis_by_aggregation(self, kpis):
-        '''
-        Aggregates given fields with given aggregation functions
-         USE CASE: per product find the mean and standard deviation of a price
-        
-        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
-         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
-         where aggfunc-s are reducing functions of either function type or strings standing for functions built in pandas module
-         
-        :return: features with index- and kpi- columns
-        :rtype: pandas DataFrame
-        '''
-        def get_valid_agg_dict_from_kpis(kpis):
-            '''
-            Filters inputs of incorrect shape or type,
-            Filters out columns not present in data
-            Removes multiple functions with the same name
-            Makes a quick check that the aggregation with given fields and functions does not fail on the first 2 lines
-            Reports to the log
-            :param list or dict kpis:
-            '''
-            def get_name(x):
-                '''
-                Returns function name for function and does nothing for string
-                '''
-                if isinstance(x, types.FunctionType):
-                    return x.__name__
-                else:
-                    return x
-                
-            def passed_first_line_type_control(col, aggfunc):
-                '''
-                Checks if aggregation works on the first 2 lines of the data
-                '''
-                try:
-                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
-                    self.data.iloc[:2]\
-                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
-                             .groupby(self.index_cols)\
-                             .agg({col : aggfunc})
-                    return True
-                except Exception as e:
-                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error: ' + str(e))
-                    return False
-           
-            
-            
-            valid_kpi_dict = defaultdict(list)
-            
-            if isinstance(kpis, list):
-                incorrect_kpis = [kpi for kpi in kpis if len(kpi) != 2]
-                if len(incorrect_kpis) > 0:
-                    self.logger.warning('Inputs ' + str(incorrect_kpis) + ' do not have correct length.')
-                
-                cols = list(zip(*kpis))[0]             
-                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
-            elif isinstance(kpis, dict):
-                cols = list(kpis.keys())
-                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
-                
-            cols_not_in_data = set(cols) - set(self.data.columns)
-            if len(cols_not_in_data) > 0:
-                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
-                
-            for col, aggfuncs in kpis:
-                if not isinstance(aggfuncs, list):
-                    aggfuncs = [aggfuncs]
-                
-                for aggfunc in aggfuncs:
-                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
-                    if not is_new_funcname:
-                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
-                    
-                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
-                        valid_kpi_dict[col].append(aggfunc)
-                    
-            return valid_kpi_dict
-                   
-        
-        
-        
-        agg_dict = get_valid_agg_dict_from_kpis(kpis)
-        
-        if len(agg_dict) > 0:
-        
-            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
-                             else '_'.join([col, str(aggfunc)]) 
-                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
-            
-            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
-            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
-                       .groupby(self.index_cols)\
-                       .agg(agg_dict)\
-                       .set_axis(new_names, axis = 'columns', inplace = False)\
-                       .reset_index()
-        else:
-            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
-        
-        
-        
-        
-        
-        
-        
-    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
-        '''
-        A wrapper around the pandas crosstab method with index equal to index_cols
-        USE CASE: per product find the standard deviation of the price in each city
-        
-        :param str pivot_col: column values of which become columns in the output
-        :param str value_col: column name to fill in values
-        :param str or func aggfunc: count if None
-        :param list entries: values of pivot_col to show
-        :return: table with index- and kpi- columns
-        :rtype: pandas DataFrame
-        '''
-        
-        # assert that types of the inputs are correct
-        types_to_check = {'pivot_col' : [str],
-                          'value_col' : [str, type(None)],
-                          'aggfunc' : [str, types.FunctionType, type(None)],
-                          'entries' : [list, type(None)]}
-        
-        self.checks.assert_correct_type(types_to_check)
-        
-        cols_to_check = [pivot_col]
-        if not value_col is None:
-            cols_to_check.append(value_col)
-        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
-
-        if not entries is None:
-            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
-        else:
-            entry_filter = pd.Series([True]*len(self.data))              
-    
-        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
-        columns = self.data.loc[entry_filter, pivot_col]
-        values = None if value_col is None\
-            else self.data.loc[entry_filter, value_col]
-
-        result = pd.crosstab(index = index, columns = columns, values = values, aggfunc = aggfunc)
-        result = result.rename(columns = {c : str(value_col) + '_' + str(c) for c in result.columns})\
-                       .reset_index()
-        return result
-    
-
-
-
-
-        
-    
-    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
-        '''
-        Aggregates values obtained with method get_value_stats
-         USE CASE: per product find average variation of the price over all cities
-         
-        :param str pivot_col:
-        :param str value_col:
-        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
-        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
-        :param list entries: 
-        :return: table with index- and kpi- columns
-        :rtype: pandas DataFrame
-        '''
-        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
-        
-        value_stat_kpis = self.get_value_stats(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
-
-        result = value_stat_kpis[self.index_cols].copy(deep = True)
-        
-        for aggfunc in aggfuncs_step2:
-            colname = '_'.join([str(aggfunc), str(aggfunc_step1), str(value_col), str(pivot_col)])
-            
-            if isinstance(aggfunc, str):
-                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
-            else:
-                result[colname] = value_stat_kpis.set_index(self.index_cols)\
-                                                 .apply(aggfunc, axis = 1)\
-                                                 .reset_index(drop = True)
-                                                 
-        return result
-                              
-                              
-                              
-                              
-                                                            
-    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
-        '''
-        Finds argmin or argmax of a column
-         USE CASE: per product find the city with maximum variation of the price
-        
-        :param str min_or_max: must be in ['min', 'max']
-        :param str pivot_col:
-        :param str value_col:
-        :param str aggfunc:    
-        '''
-        self.checks.assert_valid_value(argname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
-        
-        if min_or_max == 'max':
-            aggfuncs_step2 = ['idxmax']
-        else:
-            aggfuncs_step2 = ['idxmin']
-            
-        return self.get_aggregated_value_stats(pivot_col = pivot_col,
-                                               value_col = value_col,
-                                               aggfunc_step1 = aggfunc,
-                                               aggfuncs_step2 = aggfuncs_step2)
-        
-        
-        
-        
-    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
-    
-    # can also add peak detection from the other project and calculate the number of peaks. Probably first create a TimeSeriesManipulation class.
-    
-    # write tests for all methods
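
For reference, the aggregation pattern that get_kpis_by_aggregation wrapped can be reproduced with plain pandas. A minimal sketch with illustrative column names (not taken from the original project):

import numpy as np
import pandas as pd

# toy data: one price observation per (product, city) pair
df = pd.DataFrame({'product': ['a', 'a', 'b', 'b'],
                   'city':    ['x', 'y', 'x', 'y'],
                   'price':   [1.0, 2.0, 3.0, 5.0]})

# kpis in the dict form described in the class docstring: {column: [aggfuncs]}
kpis = {'price': ['mean', np.std]}

features = df.groupby(['product']).agg(kpis)

# flatten the resulting MultiIndex columns to '<column>_<aggfunc>' names
features.columns = ['_'.join(col) for col in features.columns]
features = features.reset_index()
print(features)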

+ 0 - 77
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Nov  7 15:11:21 2018
-
-@author: tanya
-"""
-
-import pandas as pd
-
-from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
-
-
-class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
-    '''
-    '''
-    
-    def __init__(self, data, index_cols, date_col, split_date, period_length, past_or_future = 'past', freq = 'days', n_periods = 1, path_to_log = None):
-        '''
-        '''
-        super(StatisticalFeaturesAveragedOverTimePeriods, self).__init__(data = data.copy(deep = True),
-            index_cols = index_cols,
-            date_col = date_col,
-            split_date = split_date,
-            period_length = n_periods*period_length,
-            past_or_future = past_or_future,
-            freq = freq,
-            path_to_log = path_to_log)
-        
-        self.period_number_col = 'period_number'
-        while self.period_number_col in data.columns:
-            self.period_number_col += '&'
-        
-        period_numbers = self.data[self.index_cols + [date_col]].drop_duplicates()\
-                            .groupby(index_cols)[date_col].cumcount()\
-                            .reset_index()\
-                            .assign(period_number = lambda x: x[0]/period_length)\
-                            .rename(columns = {'period_number' : self.period_number_col})
-                                       
-                
-        self.data = pd.merge(self.data, period_numbers, how = 'left', on = self.index_cols)
-                            
-        self.initial_index_cols = self.index_cols.copy()
-        self.index_cols.append(self.period_number_col)
-        
-        
-    def _aggregate_over_time_periods(self, df):
-        '''
-        '''
-        return df.drop(self.period_number_col, axis = 1)\
-                 .groupby(self.initial_index_cols)\
-                 .mean()\
-                 .reset_index()
-        
-        
-    def get_kpis_by_aggregation(self, **args):
-        '''
-        '''
-        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
-                                                      .get_kpis_by_aggregation(**args))
-            
-            
-    def get_value_stats(self, **args):
-        '''
-        '''
-        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
-                                                 .get_value_stats(**args))
-        
-        
-    def get_aggregated_value_stats(self, **args):
-        '''
-        '''
-        return self._aggregate_over_time_periods(super(StatisticalFeaturesAveragedOverTimePeriods, self)
-                                                 .get_aggregated_value_stats(**args))
-        
-    
-        
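
For reference, the period-numbering idea in the constructor above can be sketched in isolation: distinct dates are counted per index group with cumcount and bucketed into periods of fixed length. A minimal sketch, here with floor division and illustrative names:

import pandas as pd

df = pd.DataFrame({'product': ['a'] * 10,
                   'date': pd.date_range('2018-01-01', periods=10, freq='D')})

period_length = 7
# consecutive distinct dates per product, bucketed into periods of 7 entries
df['period_number'] = df.groupby('product')['date'].cumcount() // period_length
print(df)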

+ 0 - 53
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -1,53 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Nov  7 14:02:18 2018
-
-@author: tanya
-"""
-
-import logging
-import pandas as pd
-
-from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
-from libraries.exception_handling import InputChecks, InputCasts
-from libraries.logging.logging_utils import configure_logging
-
-class StatisticalFeaturesOverTime(StatisticalFeatures):
-    '''
-    '''
-    def __init__(self, data, index_cols, date_col, split_date, period_length = None, past_or_future = 'past', freq = 'days', path_to_log = None):
-        '''
-        '''
-        configure_logging(path_to_log)
-        self.logger = logging.getLogger(__name__)
-        self.checks = InputChecks(logger = self.logger)
-        self.casts = InputCasts(logger = self.logger)
-        
-        self.checks.assert_column_presence(data = data, colnames = [date_col])
-        self.checks.assert_valid_value(argname = 'past_or_future', val = past_or_future, valid_values = ['past', 'future'])
-        self.checks.assert_valid_value(argname = 'freq', val = freq, valid_values = ['seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years'])
-        
-        
-        split_date = self.casts.cast_arg_to_pandas_datetime(argname = 'split_date', val = split_date)
-        data[date_col] = self.casts.cast_column_to_pandas_datetime(series = data[date_col], colname = date_col, all_or_any = 'all')
-
-        if past_or_future == 'past':
-            if not period_length is None:
-                min_date = split_date - pd.DateOffset(**{freq : period_length})
-            else:
-                min_date = data[date_col].min()
-            sup_date = split_date
-        else:
-            min_date = split_date
-            if not period_length is None:
-                sup_date = split_date + pd.DateOffset(**{freq : period_length})
-            else:
-                sup_date = split_date + pd.DateOffset(**{freq : 1})
-        
-            
-        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)
-        
-        super(StatisticalFeaturesOverTime, self).__init__(data = data.loc[time_mask].reset_index(drop = True).copy(deep = True),
-                                                          index_cols = index_cols,
-                                                          path_to_log = path_to_log)
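
For reference, the [min_date, sup_date) window computed in the constructor above can be sketched in isolation. A minimal example, assuming past_or_future = 'past', freq = 'days' and period_length = 30:

import pandas as pd

split_date = pd.Timestamp('2018-11-07')
min_date = split_date - pd.DateOffset(days=30)   # lower bound of the 'past' window
sup_date = split_date                            # exclusive upper bound

dates = pd.Series(pd.date_range('2018-09-01', '2018-12-01', freq='D'))
time_mask = (dates >= min_date) & (dates < sup_date)
windowed = dates[time_mask]                      # rows that would be kept
print(windowed.min(), windowed.max())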

+ 0 - 15
cdplib/feature_engineering/Test.py

@@ -1,15 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Oct  7 09:59:23 2019
-
-@author: thorstensteuer
-"""
-
-class Test:
-
-    def __init__(self):
-        self.name = "Thorsten"
-
-    def say_name(self):
-        print("Hallo " + self.name)

+ 0 - 2
cdplib/feature_engineering/__init__.py

@@ -1,2 +0,0 @@
-from .Test import Test
-from .StatisticalFeatures import StatisticalFeatures

+ 0 - 798
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -1,798 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Nov  9 13:27:44 2018
-
-@author: tanja
-@description: Implementation of machine learning
-                pipeline selection and tuning with hyperopt library
-"""
-
-import os
-import sys
-import gc
-import logging
-import pickle
-import time
-import datetime
-
-import pandas as pd
-import numpy as np
-
-from sklearn.pipeline import Pipeline
-
-from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
-    space_eval, pyll
-
-from sklearn.model_selection import cross_validate
-from sklearn.metrics import make_scorer
-
-class HyperoptPipelineSelection:
-    '''
-    Use this class to perform a search
-    for a machine learning pipeline in a given parameter space.
-    The parameter space can include multiple types of Pipelines
-    (SVM, XGBOOST, random forest, etc),
-    as well as parameter distributions for each pipeline parameter.
-    See example in main for the expected space structure.
-
-    The search can be performed either randomly
-    or with a tree-based algorithm. (Other methods are currently
-    being developed by the hyperopt creators.)
-
-    Attribute trials is responsible for book-keeping parameter
-    combinations that have already been tried out. This attribute
-    is saved to a binary file every n minutes as well as every time
-    a better pipeline was found.
-    '''
-    def __init__(self,
-                 cost_func,
-                 greater_is_better: bool,
-                 trials_path: str,
-                 backup_trials_freq: int = 1,
-                 log_path: str = None,
-                 averaging_func: callable = None):
-        '''
-        :param callable cost_func: function to minimize or maximize
-
-        :param bool greater_is_better: when True
-            cost_func is maximized, else minimized.
-
-        :param str trials_path: path at which the trials object is saved
-            in binary format. From the trials object we can
-            select information about the obtained scores, score variations,
-            and pipelines, and parameters tried out so far. If a trials object
-            already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
-
-        :param backup_trials_freq: frequency in iterations (trials)
-            of saving the trials object at the trials_path.
-
-        :param str log_path: Optional, when not provided logs to stdout.
-
-        :param callable averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            Classic situation is to take the mean,
-            another example is mean() - c*var().
-        '''
-
-        assert(callable(cost_func)),\
-            "Parameter 'cost_func' must be a callable"
-
-        assert(isinstance(greater_is_better, bool)),\
-            "Parameter 'greater_is_better' must be bool type"
-
-        assert(isinstance(trials_path, str)),\
-            "Parameter 'trials_path' must be of string type"
-
-        if averaging_func is not None:
-            assert(callable(averaging_func)),\
-                "Parameter 'averaging_func' must be a callable"
-
-        self._assert_valid_directory(path=trials_path)
-
-        self._configure_logger(log_path)
-
-        self._cost_func = cost_func
-        # is 1 when cost_func is minimized, -1 when cost func is maximized
-        self._score_factor = (not greater_is_better) - greater_is_better
-        self._trials_path = trials_path
-        # is initialized with empty trials object
-        self._trials = Trials()
-        self._backup_trials_freq = backup_trials_freq
-        self._averaging_func = averaging_func or np.mean
-        # keeping track of the current search iteration
-        self._run_number = 0
-        # space and data need to be attached to perform search.
-        self._space_attached = False
-        self._data_attached = False
-
-        # if a trials object already exists at the given path,
-        # it is loaded and the search is continued. Else,
-        # the search is started from the beginning.
-        if os.path.isfile(trials_path):
-            try:
-                with open(trials_path, "rb") as f:
-                    self._trials = pickle.load(f)
-
-                self._logger.info(("Loaded an existing trials object"
-                                   "Consisting of {} trials")
-                                  .format(len(self._trials.trials)))
-
-            except Exception as e:
-                self._logger.error(("Trials object could not be loaded. "
-                                    "Training starts from the beginning. "
-                                    "Exit with error {}").format(e))
-
-        else:
-            self._logger.info(("No existing trials object was found"
-                               "Initialized an empty trials object."))
-
-        self._best_score = self.best_trial_score
-
-    def _configure_logger(self, log_path: str = None):
-        '''
-        Can be replaced with the existing script later.
-        When log_path is not provided, logs to stdout.
-        '''
-
-        self._logger = logging.getLogger(__name__)
-
-        if (self._logger.hasHandlers()):
-            self._logger.handlers.clear()
-
-        if log_path is not None:
-            assert(isinstance(log_path, str)),\
-                "Parameter 'log_path' must be of string type"
-            self._assert_valid_directory(log_path)
-
-            handler = logging.FileHandler(log_path)
-        else:
-            handler = logging.StreamHandler(sys.stdout)
-
-        formatter = logging.Formatter(
-                '\n %(asctime)s %(levelname)s %(message)s')
-
-        handler.setFormatter(formatter)
-        self._logger.addHandler(handler)
-        self._logger.setLevel("INFO")
-
-    def _backup_trials(self):
-        '''
-        Pickles (Saves) the trials object.
-        Used in a scheduler.
-        '''
-        with open(self._trials_path, "wb") as f:
-            pickle.dump(self._trials, f)
-
-    def _assert_valid_directory(self, path: str):
-        '''
-        If the directory of a path does not exist yet,
-        creates it.
-        '''
-        assert(isinstance(path, str)),\
-            "Parameter 'path' must of str type"
-
-        dirname = os.path.dirname(path)
-
-        if len(dirname) > 0:
-            os.makedirs(dirname, exist_ok=True)
-
-    def attach_space(self, space: pyll.base.Apply = None,
-                     module_path: str = None,
-                     name: str = None):
-        '''
-        :param pyll.base.Apply space: hyperopt space where
-            the search is performed. Optional when a space
-            is loaded from a python module.
-
-        :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
-
-        :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
-        '''
-        assert((space is not None) or
-               ((module_path is not None) and (name is not None))),\
-            "Either space or (module_path, name) must be provided"
-
-        if space is None:
-            for p in ["modele_path", "name"]:
-                assert(isinstance(p, str)),\
-                    "Parameter '{}' must be of str type".format(p)
-
-            assert(os.path.isfile(module_path)),\
-                "Parameter 'module_path' must be a valid file"
-
-            module, extension = os.path.splitext(os.path.basename(module_path))
-            assert(extension == ".py"),\
-                "Parameter 'space' must be read from a python file"
-
-            sys.path.insert(0, os.path.dirname(module_path))
-
-            try:
-                space = getattr(__import__(module), name)
-            except (ImportError, AttributeError):
-                err = "Invalid space location or name"
-                self._logger.error(err)
-                raise Exception(err)
-
-        assert(isinstance(space, pyll.base.Apply)),\
-            "Parameter 'space' must be of hyperopt space type"
-
-        self._space = space
-        self._logger.info("Attached parameter distribution space")
-        self._space_attached = True
-
-    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
-            -> np.ndarray:
-        '''
-        Converts a pandas DataFrame or Series to a numpy array.
-        '''
-        if isinstance(x, np.ndarray):
-            return x
-
-        elif (isinstance(x, pd.core.frame.DataFrame))\
-                or (isinstance(x, pd.core.series.Series)):
-            return x.values
-
-        else:
-            e = 'The argument must be a numpy array or a pandas DataFrame'
-            self._logger.critical(e)
-            raise ValueError(e)
-
-    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
-                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    X_val: (pd.DataFrame, np.ndarray) = None,
-                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    cv: (list, int) = None):
-        '''
-        :param array X_train: data on which
-            machine learning pipelines are trained
-
-        :param array y_train: optional, vector with targets,
-            (not all algorithms require targets)
-
-        :param array X_val: optional, validation data.
-            When not provided, cross-validated value
-            of the cost_func is calculated.
-
-        :param array y_val: optional, validation targets
-
-        :param list cv: list of tuples containing
-            train and validation indices or an integer representing
-            the number of folds for a random split of data
-            during cross-validation
-            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
-        '''
-
-        X_train = self._convert_to_array(X_train)
-        if y_train is not None:
-            y_train = self._convert_to_array(y_train)
-
-        if X_val is not None:
-            if cv is not None:
-                self._logger.warning(("Both validation set and cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on the validation set!"))
-
-            X_val = self._convert_to_array(X_val)
-
-            train_inds = list(range(len(X_train)))
-            val_inds = list(range(len(X_train),
-                                  len(X_train) + len(X_val)))
-
-            # cost is evaluated with a cross validation function
-            # that accepts an array and a cv object with
-            # indices of the fold splits.
-            # Here we create a trivial cv object
-            # with one validation split.
-            self._cv = [(train_inds, val_inds)]
-            self._X = np.concatenate([X_train, X_val])
-
-            if y_train is not None:
-                if y_val is None:
-                    err = "Argument y_val must be provided"
-                    self._logger.critical(err)
-                    raise ValueError(err)
-                else:
-                    y_val = self._convert_to_array(y_val)
-                    self._y = np.concatenate([y_train, y_val])
-            else:
-                self._y = None
-        else:
-            if cv is None:
-                self._logger.warning(("Neither validation set nor cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on 5 randomly "
-                                      "splitted folds."))
-
-            self._X = X_train
-            self._y = y_train
-            self._cv = cv
-
-        self._logger.info("Attached data")
-        self._data_attached = True
-
-    def _evaluate(self, pipeline: Pipeline) -> dict:
-        '''
-        This method is called in _objective.
-
-        Calculates the cost on the attached data.
-        This function can be overriden, when the cost
-        needs to be calculated differently,
-        for example with a tensorflow model.
-
-        :param Pipeline pipeline: machine learning pipeline
-            that will be evaluated with cross-validation
-
-        :output: dictionary with the aggregated
-            cross-validation score and
-            the score variance.
-        '''
-
-        scores = cross_validate(estimator=pipeline,
-                                X=self._X,
-                                y=self._y,
-                                cv=self._cv or 5,
-                                scoring=make_scorer(self._cost_func),
-                                error_score=np.nan)
-
-        return {'value': self._averaging_func(scores['test_score']),
-                'variance': np.var(scores['test_score'])}
-
-    def _objective(self, space_element: dict) -> dict:
-        '''
-        This method is called in search_for_best_pipeline
-        inside the hyperopt fmin method.
-
-        Uses _evaluate method.
-
-        It must take as input a space element
-        and produce an output in the form of dictionary
-        with 2 obligatory values loss and status
-        (STATUS_OK or STATUS_FAIL). Other
-        values in the output are optional and can be
-        accessed later through the trials object.
-
-        :Warning: fmin minimizes the loss,
-        when _evaluate returns a value to be maximized,
-        it should be multiplied by -1 to obtain loss.
-
-        :param dict space_element: must contain keys
-            name (with the name of the pipeline),
-            pipeline (Pipeline object),
-            params (dict of pipeline params)
-
-        :output: dictionary with keys
-            loss (minimized value),
-            status with values STATUS_OK or STATUS_FAIL
-            understood by hyperopt,
-            score (equal to loss or -loss),
-            score_variance,
-            timestamp (end of execution),
-            train_time: execution time
-        '''
-        assert(isinstance(space_element, dict) and
-               set(['name', 'pipeline', 'params']) <= space_element.keys())
-
-        assert(isinstance(space_element['name'], str) and
-               isinstance(space_element['pipeline'], Pipeline) and
-               isinstance(space_element['params'], dict))
-
-        start_time = time.time()
-
-        if not self._data_attached:
-            raise Exception(("Data must be attached in order "
-                             "in order to effectuate the best"
-                             "pipeline search"))
-
-        self._run_number += 1
-
-        pipeline = space_element['pipeline']
-        params = space_element['params']
-        pipeline.set_params(**params)
-
-        self._logger.info(("Run number {0}: "
-                           "Current score is {1}: "
-                           "Training pipeline {2} "
-                           "with parameters: {3}. ").format(
-                             self._run_number,
-                             self._best_score,
-                             space_element['name'],
-                             params))
-
-        try:
-            score_stats = self._evaluate(pipeline)
-            assert(not np.isnan(score_stats["value"])),\
-                "Returned null score"
-
-            if self._run_number % self._backup_trials_freq == 0:
-                self._backup_trials()
-
-            if (self._best_score != self._best_score) or\
-                self._score_factor*score_stats["value"] <\
-                    self._score_factor*self._best_score:
-
-                self._logger.info("Score got better, new best score is: {}"
-                                  .format(score_stats["value"]))
-
-                self._best_score = score_stats['value']
-
-                self._backup_trials()
-
-            end_time = time.time()
-
-            return {'loss': self._score_factor * score_stats["value"],
-                    'status': STATUS_OK,
-                    'score': score_stats["value"],
-                    'score_variance': score_stats["variance"],
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': end_time - start_time}
-
-        except Exception as e:
-
-            self._logger.warning("Trial failed with error {}".format(e))
-
-            return {'loss': np.nan,
-                    'status': STATUS_FAIL,
-                    'score': np.nan,
-                    'score_variance': np.nan,
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': np.nan}
-
-    def search_for_best_pipeline(self,
-                                 niter: int,
-                                 algo: callable = tpe.suggest):
-        '''
-        Method performing the search of the best pipeline in the given space.
-        Calls fmin function from the hyperopt library to minimize the output of
-        _objective.
-
-        :params int niter: number of search iterations
-        :param callable algo: now can only take values tpe for a tree-based
-            random search or random for random search
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        assert(isinstance(niter, int)),\
-            "Parameter 'niter' must be of int type"
-
-        # right now only two algorithms are provided by hyperopt
-        assert(algo in [tpe.suggest, rand.suggest]),\
-            ("Parameter 'algo' can currently only be tpe or random. "
-             "If other algorithms have been developed "
-             "by hyperopt, please add them to the list.")
-
-        try:
-            self._logger.info(("Starting {0} iterations of search "
-                               "additional to {1} previous"
-                               .format(niter, len(self._trials.trials))))
-
-            best = fmin(fn=self._objective,
-                        space=self._space,
-                        algo=algo,
-                        trials=self._trials,
-                        max_evals=len(self._trials.trials) + niter)
-
-            # print('AAAA', str(niter))
-
-            self._logger.info(
-                    "Best score is {0} with variance {1}"
-                    .format(
-                     self._trials.best_trial["result"]["score"],
-                     self._trials.best_trial["result"]["score_variance"]))
-
-            self._logger.info(("Finished {0} iterations of search.\n"
-                               "Best parameters are:\n {1} ")
-                              .format(niter,
-                                      space_eval(self._space, best)))
-
-            self._backup_trials()
-
-        except Exception as e:
-            raise ValueError(("Failed to select best "
-                             "pipeline! Exit with error: {}").format(e))
-
-    @property
-    def best_trial_score(self) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > 0:
-            return self._trials.best_trial["result"]["score"]
-        else:
-            return np.nan
-
-    @property
-    def best_trial_score_variance(self) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > 0:
-            return self._trials.best_trial["result"]["score_variance"]
-        else:
-            return np.nan
-
-    @property
-    def best_trial_pipeline(self) -> Pipeline:
-        '''
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) > 0:
-
-            return space_eval(
-                    self._space,
-                    {k: v[0] for k, v in
-                     self._trials.best_trial['misc']['vals'].items()
-                     if len(v) > 0})["pipeline"]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def _ith_trial_loss(self, i: int) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > i:
-            return self._trials.trials[i]['result']['loss']
-        else:
-            return np.nan
-
-    def _ith_trial_element(self, i: int, name: str) -> object:
-        '''
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) > i:
-            return space_eval(self._space,
-                              {k: v[0] for k, v in
-                               self._trials.trials[i]['misc']['vals']
-                               .items() if len(v) > 0})[name]
-
-    def _ith_trial_pipeline(self, i: int) -> Pipeline:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='pipeline')
-
-    def _ith_trial_name(self, i: int) -> str:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='name')
-
-    def _ith_trial_params(self, i: int) -> dict:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='params')
-
-    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
-        '''
-        '''
-        if len(self._trials.trials) > i:
-            return self._trials.trials[i]["result"]["timestamp"]
-
-    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
-        '''
-        Returns the list of n best pipelines
-        documented in trials
-        '''
-        if len(self._trials.trials) > 0:
-            if losses is None:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))]
-
-            best_n_indices = [losses.index(l)
-                              for l in sorted(list(set(losses)))[:n]]
-
-            return [self._ith_trial_pipeline(i) for i in best_n_indices]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
-        '''
-        Returns a dictionary where keys are pipeline names,
-        and values are lists of best pipelines with this name
-        '''
-        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
-
-        if len(self._trials.trials) > 0:
-
-            best_pipelines_per_type = {}
-            names = [self._ith_trial_name(i)
-                     for i in range(len(self._trials.trials))]
-
-            for nm in names:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))
-                          if self._ith_trial_name(i) == nm]
-
-                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
-                                                        n=n,
-                                                        losses=losses)
-
-            return best_pipelines_per_type
-
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def write_trials_documentation(self, path: str = None):
-        '''
-        Saves an excel file with pipeline names, scores,
-        parameters, and timestamps.
-        '''
-        if len(self._trials.trials) > 0:
-            path = path or "hyperopt_trials_documentation.xlsx"
-
-            assert(isinstance(path, str)),\
-                "Parameter 'path' must be of string type"
-
-            self._assert_valid_directory(path)
-
-            names = [self._ith_trial_name(i)
-                     for i in range(len(self._trials.trials))]
-            scores = [self._score_factor*self._ith_trial_loss(i)
-                      for i in range(len(self._trials.trials))]
-            params = [self._ith_trial_params(i)
-                      for i in range(len(self._trials.trials))]
-            timestamps = [self._ith_trial_timestamp(i)
-                          for i in range(len(self._trials.trials))]
-
-        else:
-            names = []
-            scores = []
-            params = []
-            timestamps = []
-
-        pd.DataFrame({"name": names,
-                      "score": scores,
-                      "params": params,
-                      "timestamp": timestamps})\
-          .to_excel(path)
-
-
-if __name__ == '__main__':
-
-    from sklearn.metrics import roc_auc_score, make_scorer
-    from xgboost import XGBClassifier
-    from sklearn.svm import SVC
-    from sklearn.feature_selection import SelectKBest
-    from sklearn.decomposition import PCA
-    from sklearn.datasets import load_iris
-    from pprint import pprint
-
-    data = load_iris()
-    X = pd.DataFrame(data.data)
-    y = pd.Series(data.target)
-    # produce a binary variable
-    y = (y == 2).astype(int)
-    del data
-    gc.collect()
-
-    # SPACE DEFINITION ########################################
-    # (can be moved to a separate python script)
-
-    """
-    A search space must be a list of dictionaries.
-    Each dictionary must have keys:
-        name (pipeline name or type),
-        pipeline (instance of sklearn.pipeline.Pipeline),
-        params (dictionary of distributions for the parameters of
-                the pipeline that we want to tune)
-
-    Here we have a space that consists of two dictionaries:
-    KBEST_XGBOOST and PCA_SVC
-    """
-    space = []
-
-    pipeline_dist_1 = {}
-    pipeline_dist_1["name"] = "KBEST_XGBOOST"
-
-    """
-    A pipeline consists of steps (tuples).
-    Each step has a name and an algorithm.
-    This pipeline, as a first step performs
-    feature selection with SelectKBest and
-    as a second step evaluates a machine learning algo (xgboost).
-
-    Like all sklearn algorithms, a Pipeline has methods
-    fit, predict, set_params, get_params
-    """
-    pipeline_dist_1["pipeline"] = Pipeline([
-                                     ('kbest', SelectKBest()),
-                                     ('xgb', XGBClassifier())
-                                     ])
-    """
-    Pipeline parameter dictionaries must be of the form:
-    {'kbest__k': 3, xgb__n_estimators: 20},
-    each parameter name consists of the step name, __, and parameter name.
-
-    Here, instead of values, the parameter names are followed
-    by hyperopt distributions.
-    Each hyperopt distribution also must have a name,
-    due to hyperopt functionality.
-
-    Here, we set the hyperopt distribution name to the step name,
-    but it does not have to be so. Hyperopt distribution names
-    must be different for different elements of the space.
-    """
-
-    pipeline_dist_1["params"] = {
-            'kbest__k': hp.choice('kbest__k', range(1, 5)),
-
-            'xgb__n_estimators':
-            50 + hp.randint('xgb__n_estimators', 50),
-
-            "xgb__learning_rate":
-            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
-            }
-
-    space.append(pipeline_dist_1)
-
-    pipeline_dist_2 = {}
-    pipeline_dist_2["name"] = "PCA_SVC"
-
-    pipeline_dist_2["pipeline"] = Pipeline([
-                                     ('pca', PCA()),
-                                     ('svc', SVC(gamma="scale"))
-                                     ])
-
-    pipeline_dist_2["params"] = {
-            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
-
-            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
-            }
-
-    space.append(pipeline_dist_2)
-
-    space = hp.choice('pipelines', space)
-
-    # TESTING ##########################################################
-
-    trials_path = 'TEST_hyperopt_trials.pkl'
-
-    doc_path = 'TEST_hyperopt_doc.xlsx'
-
-    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
-                                       greater_is_better=True,
-                                       trials_path=trials_path)
-
-    hp_obj.attach_data(X_train=X, y_train=y)
-
-    hp_obj.attach_space(space=space)
-
-    hp_obj.search_for_best_pipeline(niter=10)
-
-    print('\n', '='*20, 'TESTING', '='*20)
-
-    print('\n', 'Best score:', hp_obj.best_trial_score)
-
-    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
-
-    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
-
-    print('\n', 'Best 3 pipelines: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
-
-    print('\n', 'Best pipeline per type: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
-
-    hp_obj.write_trials_documentation(path=doc_path)
-
-    # os.remove(doc_path)
-    # os.remove(trials_path)
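
For reference, the trials object pickled by _backup_trials can be inspected outside the class with the standard library. A minimal sketch, assuming the test above has been run once and produced 'TEST_hyperopt_trials.pkl':

import pickle

with open('TEST_hyperopt_trials.pkl', 'rb') as f:
    trials = pickle.load(f)

# every entry stores the dictionary returned by _objective
for trial in trials.trials:
    result = trial['result']
    print(result['status'], result.get('score'), result.get('train_time'))

print('best score:', trials.best_trial['result']['score'])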

+ 0 - 211
db_handlers/MongodbHandler.py

@@ -1,211 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-Created on Mon Sep 16 13:27:44 2019
-
-@author: oskar
-@description: Implementation of a database handler for abstraction of the mongodb.
-"""
-
-
-import json
-import simplejson
-import sys
-import os
-import jsonref
-
-from copy import deepcopy
-from pymongo import MongoClient
-import pandas as pd
-import numpy as np
-
-sys.path.append(os.getcwd())
-from libraries.log import Log
-from libraries.configuration import default as cfg
-
-class MongodbHandler:
-
-    '''
-
-    '''
-
-    def __init__(self, database_url: str = cfg['MONGO_DB']['URI'],
-                 database_name: str = cfg['MONGO_DB']['DATABASE_NAME']):
-        '''
-        :param str database_url: Url for the mongodb database
-        :param str database_name: Name of the database the database handler should handle
-        '''
-        assert(isinstance(database_url, str)),\
-            "Parameter 'database_url' must be a string type"
-        assert(isinstance(database_name, str)),\
-            "Parameter 'database_name' must be a string type"
-
-        self._log = Log("\nMongodbHandler script")
-
-        self._log.info('Mongodb Handler has been initialized')
-        # Connect to the MongoDB
-        self._client = MongoClient(database_url)
-        # Connect to the oebb_db database, or create it if it doesn't exist.
-        self._database = self._client[database_name]
-
-    def _read_schema(self, schema_path: str) -> dict:
-        '''
-        :param str schema_path: path to the schema file.
-        '''
-
-        assert(isinstance(schema_path, str)),\
-            "Parameter 'schema_path must be a string type"
-
-        with open(schema_path) as json_file:
-            schema = json.load(json_file)
-
-        if 'definitions' in schema:
-            schema = self._dereference_schema(schema)
-
-        return schema
-
-    def _dereference_schema(self, schema: dict) -> dict:
-        '''
-        :param dict schema: dictionary containing a schema which uses references.
-        '''
-
-        assert(isinstance(schema, dict)),\
-            "Parameter 'schema' must be a dictionary type"
-
-        schema = jsonref.loads(str(schema).replace("'", "\""))
-        schema = deepcopy(schema)
-        schema.pop('definitions', None)
-        return schema
-
-    def set_collection_schema(self, collection_name: str, schema_path: str,
-                              validation_level: str = 'moderate',validation_action: str = 'error'):
-        '''
-        :param str collection_name: name on the collection for which the schema will be set.
-        :param str schema_path: path to the schema file.
-        :param str validation_level: level of validation done by the mongodb.
-        :param str validation_action: what will happen upon validation error, warning or error message.
-        '''
-        assert(isinstance(collection_name, str)),\
-            "Parameter 'collection_name' must be a string type"
-        assert(isinstance(schema_path, str)),\
-            "Parameter 'schema_path' must be a string type"
-        assert(isinstance(validation_level, str)),\
-            "Parameter 'validation_lever' must be a string type"
-        assert(isinstance(validation_action, str)),\
-            "Parameter 'validation_action' must be a string type"
-
-        schema = self._read_schema(schema_path)
-
-        command = {
-                    'collMod': collection_name,
-                    'validator': {
-                        '$jsonSchema': schema
-                    },
-                    'validationLevel': validation_level,
-                    'validationAction': validation_action
-                    }
-
-        self._database.command(command)
-
-    def create_collection(self, collection_name):
-        '''
-        :param str collection_name: name of the collection to be created.
-        '''
-
-        assert(isinstance(collection_name, str)),\
-            "Parameter 'collection_name' must be a string type"
-
-        if collection_name not in self._database.list_collection_names():
-            self._log.info(("Collection '{}' has been created").format(collection_name))
-            return self._database.create_collection(collection_name)
-        else:
-            self._log.info(("Collection '{}' already exists").format(collection_name))
-            return self._database[collection_name]
-
-    def insert_data_into_collection(self, data: (dict, list, np.ndarray, pd.DataFrame, pd.Series),
-                                    collection_name: str,
-                                    ordered: bool = False):
-        '''
-        :param data: data to be inserted into the collection
-        :param str collection_name: name of the collection the data will be added to.
-        '''
-
-        allowed_types = (dict, list, np.ndarray, pd.DataFrame, pd.Series)
-
-        assert(isinstance(data, allowed_types)),\
-            "Parameter 'data' is of invalid type"
-
-        if isinstance(data, np.ndarray):
-            data = pd.DataFrame(data)
-
-        if isinstance(data, pd.DataFrame):
-
-            data = simplejson.loads(data.to_json(orient="records",
-                                                 date_format="iso"))
-
-        elif isinstance(data, pd.Series):
-
-            data = simplejson.loads(data.to_json(date_format="iso"))
-
-        if isinstance(data, dict):
-            self._database[collection_name].insert_one(data)
-
-        elif len(data) == 1:
-            self._database[collection_name].insert_one(data[0])
-
-        else:
-            self._database[collection_name].insert_many(data, ordered=ordered)
-
-        self._log.info(('Data has been inserted into the {} collection').format(collection_name))
-
-    def create_collection_and_set_schema(self, collection_name: str, schema_path: str):
-        '''
-        :param str collection_name: name of the collection to be created.
-        :param str schema_path: path to the schema file.
-        '''
-        assert(isinstance(collection_name, str)),\
-            "Parameter 'collection_name' must be a string type"
-        assert(isinstance(schema_path, str)),\
-            "Parameter 'schema_path' must be a string type"
-
-        self.create_collection(collection_name)
-        self.set_collection_schema(collection_name=collection_name, schema_path=schema_path)
-
-    def query_data_and_generate_dataframe(self, collection_name: str, attribute: str = None,
-                                          attribute_value: str = None, comparison_operator: str = '$eq'):
-        '''
-        :param str collection_name: name of the collection to query.
-        :param str attribute: optional field to filter on.
-        :param attribute_value: value the attribute is compared against.
-        :param str comparison_operator: mongodb comparison operator, e.g. '$eq' or '$gt'.
-        '''
-        if attribute is None or attribute_value is None:
-            data = self._database[collection_name].find()
-        else:
-            data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}})
-
-        df = pd.DataFrame(list(data))
-        df.set_index('radsatznummer', inplace=True)
-        return df
-
-
-if __name__ == "__main__":
-
-    log = Log("Test MongodbHandler:")
-
-    log.info('Script started')
-
-    db_handler = MongodbHandler()
-
-    # Create a collection for each schema file and attach the schema to it.
-    for schema_path in [
-            os.path.join(".", "mongo_schema", "schema_wheelsets.json"),
-            os.path.join(".", "mongo_schema", "schema_process_instances.json"),
-            os.path.join(".", "mongo_schema", "schema_componets.json")]:
-
-        if os.path.isfile(schema_path):
-
-            collection_name = os.path.basename(schema_path).lstrip("_schema").split(".")[0]
-
-            db_handler.create_collection_and_set_schema(collection_name, schema_path)
-
-    log.info(("Existing databases: {}, Collection in OEBB database {}")\
-             .format(db_handler._client.list_database_names(), db_handler._database.list_collection_names()))
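For orientation, a minimal usage sketch of the MongodbHandler deleted above. This is a sketch only: it assumes the class is importable as db_handlers.MongodbHandler, a reachable MongoDB instance configured as in the __main__ block, and a hypothetical 'test_collection' with made-up columns; 'radsatznummer' is included because query_data_and_generate_dataframe sets it as the index.

    import pandas as pd
    from db_handlers.MongodbHandler import MongodbHandler

    db_handler = MongodbHandler()

    # attach a json schema as a validator (path taken from the __main__ block above)
    db_handler.create_collection_and_set_schema(
        collection_name="wheelsets",
        schema_path="./mongo_schema/schema_wheelsets.json")

    # toy round trip against a schema-less collection
    db_handler.create_collection("test_collection")
    df = pd.DataFrame({"radsatznummer": ["w1", "w2"], "status": ["ok", "ok"]})
    db_handler.insert_data_into_collection(df, collection_name="test_collection")
    result = db_handler.query_data_and_generate_dataframe(
        collection_name="test_collection", attribute="status", attribute_value="ok")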

+ 0 - 595
db_handlers/SQLHandler.py

@@ -1,595 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Sep 18 16:20:50 2018
-
-@author: tanya
-"""
-
-import os
-import sys
-import re
-import sqlalchemy
-import sqlparse
-import pandas as pd
-import warnings
-
-sys.path.append(os.getcwd())
-
-
-class SQLHandler:
-    '''
-    Provides methods for executing sql queries
-    with different database connectors.
-    Remark: in each method we force a new opening and
-    closing of the database connection;
-    this avoids errors when parallelizing with multiprocessing.
-    '''
-
-    def __init__(self, db_uri: str = None,
-                 is_case_insensitive: bool = False):
-        '''
-        :param str db_uri:
-            of form
-            <sqlalchemy_dialect//user:password@host:port/dbname?charset=utf8&local_infile=1>
-
-         sqlalchemy dialects:
-             for mysql : mysql+pymysql
-             for db2: ibm_db_sa
-        '''
-
-        from libraries.log import Log
-        from libraries.configuration import default as cfg
-        from sqlalchemy_utils import database_exists, create_database
-
-        self._log = Log(name='SQLHandler')
-
-        if db_uri is None:
-            db_uri = cfg["SQL_DB"]["URI"]
-
-        assert(isinstance(db_uri, str)),\
-            "Parameter 'db_uri' must be of type str"
-
-        assert(re.match(r'.+://.+:(.+)?@.+:.+/.+', db_uri) is not None),\
-            ('database url does not match the pattern: '
-             'sqlalchemy_dialect://user:password@host:port/dbname')
-
-        self._db_uri = db_uri
-
-        engine = sqlalchemy.create_engine(self._db_uri)
-
-        if not database_exists(engine.url):
-            create_database(engine.url)
-
-        query = "CREATE DATABASE IF NOT EXISTS {}"\
-            .format(self._connection_params["db"])
-
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            engine.execute(query)
-
-        assert(isinstance(is_case_insensitive, bool)),\
-            "Parameter 'is_case_sensetive' must of type bool"
-
-        if 'ibm' in db_uri and not is_case_insensitive:
-            raise Exception('IBM DB2 is case insensitive; set is_case_insensitive=True')
-
-        self._is_case_insensitive = is_case_insensitive
-
-        self._engine = sqlalchemy.create_engine(self._db_uri)
-
-    @property
-    def _connection_params(self) -> dict:
-        '''
-        :return: connection parameters like user,
-            password, host, port, and database name
-        :rtype: dict
-        '''
-        try:
-            connection_params = {}
-
-            connection_params['user'], connection_params['password'] =\
-                self._db_uri.split('//')[1]\
-                            .split('@')[0]\
-                            .split(':')
-
-            connection_params['host'], connection_params['port'] =\
-                self._db_uri.split('//')[1]\
-                            .split('@')[1]\
-                            .split('/')[0]\
-                            .split(':')
-
-            connection_params['db'] = self._db_uri.split('/')[-1]\
-                                                  .split('?')[0]
-
-            return connection_params
-
-        except Exception as e:
-            err = ("Could not parse connection parameters."
-                   "Finished with error {}")\
-                   .format(e)
-
-            self._log.error(err)
-            raise Exception(err)
-
-    def drop_database(self):
-        '''
-        '''
-        database = self._connection_params["db"]
-        self.execute("DROP DATABASE IF EXISTS {}".format(database))
-
-    @property
-    def _db_metadata(self) -> dict:
-        '''
-        Returns sql-dialect specific information such as the name of the
-        information schema and the column names in information_schema.tables
-        and information_schema.columns.
-        For ibm databases, information_schema is set to syscat,
-        else it is set to information_schema.
-        If these default values do not exist in the given database,
-        the output of the method is set to None
-
-        :return: dictionary with information_schema, schema_col,
-            table_col, column_col, default_schema
-        '''
-
-        db_metadata = {}
-
-        if 'ibm' in self._db_uri:
-            db_metadata['information_schema'] = 'syscat'
-            db_metadata['schema_col'] = 'tabschema'
-            db_metadata['table_col'] = 'tabname'
-            db_metadata['column_col'] = 'colname'
-            db_metadata['default_schema'] =\
-                self._connection_params['user'].upper()
-        else:
-            db_metadata['information_schema'] = 'information_schema'
-            db_metadata['schema_col'] = 'TABLE_SCHEMA'
-            db_metadata['table_col'] = 'TABLE_NAME'
-            db_metadata['column_col'] = 'COLUMN_NAME'
-            db_metadata['default_schema'] =\
-                self._connection_params['db']
-
-        # check if it worked to create metadata
-        try:
-            query = """SELECT *
-                       FROM {}.tables
-                       LIMIT 1
-                    """.format(db_metadata['information_schema'])
-            self.execute(query)
-
-        except Exception as e:
-            self._log.error(e)
-            db_metadata = None
-
-        return db_metadata
-
-    def execute(self, query):
-        '''
-        Executes an sql query.
-        Remark: queries like CREATE, DROP, SELECT work
-        for the majority of sqlalchemy dialects;
-        queries like SHOW TABLES, LOAD DATA, and queries using
-        INFORMATION_SCHEMA are mysql specific and might
-        not exist in a different dialect.
-
-        :param str query:
-        '''
-        connection = self._engine.connect()
-        transaction = connection.begin()
-
-        errors = []
-
-        # in the case of multi-query execute each query
-        for sub_query in sqlparse.split(query):
-            if len(sub_query) > 0:
-                try:
-                    connection.execute(sub_query)
-
-                except Exception as e:
-                    errors.append(str(e))
-
-        if len(errors) > 0:
-            err = ('Could not execute some of the queries. '
-                   'Obtained exceptions: {}'
-                   .format('\n'.join(errors)))
-
-            self._log.error(err)
-            raise Exception(err)
-
-        transaction.commit()
-        connection.close()
-
-    def execute_query_from_file(self, filename: str):
-        '''
-        '''
-        with open(filename, 'r') as f:
-            query = f.read()
-
-        self.execute(query)
-
-    def get_tablenames(self, schema: str = None, query: str = None):
-        '''
-        '''
-        if (self._db_metadata is None) and (query is None):
-            raise Exception('Please specify the query')
-
-        else:
-            try:
-                if query is None:
-                    schema_or_default_schema =\
-                        self._db_metadata['default_schema']\
-                        if schema is None else schema
-
-                    query = """SELECT DISTINCT {0}
-                               FROM {1}.tables
-                               WHERE {2} = '{3}'
-                            """.format(
-                            self._db_metadata['table_col'],
-                            self._db_metadata['information_schema'],
-                            self._db_metadata['schema_col'],
-                            schema_or_default_schema)
-
-                tables = self.read_sql_to_dataframe(query).iloc[:, 0].tolist()
-                return tables
-
-            except Exception as e:
-                err = ("Could not get tablenames"
-                       "Finished with error {}".format(e))
-
-                self._log.error(err)
-                raise Exception(err)
-
-    def check_if_table_exists(self, tablename: str,
-                              schema: str = None,
-                              query: str = None):
-        '''
-        Tries to retrieve table information from database with given query.
-        If this does not work, tries to select one row from the given table,
-        if this fails, assumes that the table does not exist.
-
-        :param str tablename:
-        :param str schema:
-        :param str query: if not specified, tries to find
-            tablename in information_schema specified in _db_metadata.
-        :return: if the table exists or not
-        :rtype: bool
-        '''
-        if self._is_case_insensitive:
-            tablename = tablename.upper()
-
-        try:
-            tablenames = self.get_tablenames(schema=schema, query=query)
-
-            table_exists = (tablename in tablenames)
-
-        except Exception as e:
-            self._log.warning(('Could not execute query to retrieve table '
-                               'information. Trying to execute a '
-                               'select statement. '
-                               'Got exception {}').format(e))
-            try:
-                query = """SELECT *
-                           FROM {0}{1}
-                           LIMIT 1
-                        """.format('' if schema is None else schema + '.',
-                                   tablename)
-
-                self.execute(query)
-
-                table_exists = True
-
-            except Exception as e:
-                self._log.warning(('Failed to select from {0}. '
-                                   'Finished with error {1}. '
-                                   'Conclusion: table does not exist')
-                                  .format(tablename, e))
-
-                table_exists = False
-
-        return table_exists
-
-    def create_schema(self, schema: str, query: str = None):
-        '''
-        Creates a schema if it does not exist, else does nothing
-
-        :param str schema:
-        :param str query: if None trying to read schemas from
-            information_schema specified in db_metadata
-        '''
-        if (query is None):
-
-            if self._db_metadata is None:
-                raise Exception('Please specify query')
-            else:
-                query = """SELECT DISTINCT {0}
-                           FROM {1}.tables""".format(
-                              self._db_metadata['schema_col'],
-                              self._db_metadata['information_schema'])
-
-        try:
-            schemas = self.read_sql_to_dataframe(query).iloc[:, 0].tolist()
-        except Exception as e:
-            err = ("Could not retrieve the list of schemas"
-                   "from the database. Finished with error {}"
-                   .format(e))
-
-            self._log.error(err)
-            raise Exception(err)
-
-        if schema not in schemas:
-            self.execute("CREATE SCHEMA {}".format(schema))
-
-    def drop_table_if_exists(self, tablename: str,
-                             schema: str = None,
-                             query: str = None):
-        '''
-        :param str tablename:
-        :param str schema:
-        :param str query: if not specified, default value is "DROP TABLE"
-        '''
-        if self._is_case_insensitive:
-            tablename = tablename.upper()
-
-        schema = '' if schema is None else schema + '.'
-
-        if query is None:
-            query = "DROP TABLE {0}{1};".format(schema, tablename)
-
-        try:
-            if self.check_if_table_exists(tablename):
-                self.execute(query)
-
-        except Exception as e:
-            err = ("Could not drop the table {0} ."
-                   "Finished with error {1}"
-                   .format(tablename, e))
-
-            self._log.error(err)
-            raise Exception(err)
-
-    def get_column_names(self, tablename: str,
-                         schema: str = None,
-                         query: str = None):
-        '''
-        Tries to retrieve column information from database with given query.
-        If this does not work, tries to select one row from the given table.
-
-        :param str tablename:
-        :param str schema:
-        :param str query: if not specified, tries to select column
-            names in the information_schema specified in db_metadata
-        '''
-        if self._is_case_insensitive:
-            tablename = tablename.upper()
-
-        if not self.check_if_table_exists(tablename=tablename,
-                                          schema=schema):
-
-            err = "Table {} does not exist".format(tablename)
-            self._log.error(err)
-            raise Exception(err)
-
-        try:
-            if query is None:
-                if self._db_metadata is None:
-                    raise Exception('Please specify the query')
-
-                else:
-                    schema_or_default_schema =\
-                        self._db_metadata['default_schema']\
-                        if schema is None else schema
-
-                    query = """SELECT DISTINCT {0}
-                               FROM {1}.columns
-                               WHERE {2} = '{3}'
-                               AND {4} = '{5}'
-                            """.format(
-                            self._db_metadata['column_col'],
-                            self._db_metadata['information_schema'],
-                            self._db_metadata['schema_col'],
-                            schema_or_default_schema,
-                            self._db_metadata['table_col'],
-                            tablename)
-
-            colnames = [c.lower() for c in
-                        self.read_sql_to_dataframe(query).iloc[:, 0].tolist()]
-
-        except Exception as e:
-            self._log.warning((
-                'Could not select columns from the '
-                'information schema. Trying to '
-                'load the table into a dataframe and select column names. '
-                'Obtained exception {}').format(e))
-
-            query = """SELECT *
-                       FROM {0}{1}
-                       LIMIT 1
-                    """.format('' if schema is None else schema + '.',
-                               tablename)
-
-            data = self.read_sql_to_dataframe(query)
-            colnames = data.columns.tolist()
-
-        return colnames
-
-    def load_csv_to_db(self, filename: str,
-                       tablename: str,
-                       schema: str = None,
-                       query: str = None,
-                       **kwargs):
-        '''
-        Tries to load data from csv file to database with a given query.
-        If this does not work, tries to load data from csv to a
-        pandas dataframe first, and then write it to the database.
-
-        :param str filename:
-        :param str tablename:
-        :param str schema:
-        :param str query: if not specified, tries to use
-        LOAD DATA LOCAL INFILE query
-        '''
-
-        if not self.check_if_table_exists(tablename=tablename,
-                                          schema=schema):
-
-            err = ('Table {} does not exist. '
-                   'Please create it first').format(tablename)
-            self._log.error(err)
-            raise Exception(err)
-
-        else:
-            try:
-                if query is None:
-                    query = """LOAD DATA LOCAL INFILE '{0}'
-                               INTO TABLE {1}{2}
-                               COLUMNS TERMINATED BY ','
-                               OPTIONALLY ENCLOSED BY '"'
-                               LINES TERMINATED BY '\r\n'
-                               IGNORE 1 LINES
-                               ({3})
-                               ;""".format(
-                                   filename,
-                                   '' if schema is None else schema + '.',
-                                   tablename,
-                                   ','.join(self.get_column_names(tablename)))
-
-                self.execute(query)
-
-            except Exception as e:
-                err = ("Could not load the file {0} "
-                       "to the table {1}. "
-                       "Finished with error {2}")\
-                       .format(filename, tablename, e)
-
-                self._log.error(err)
-                raise Exception(err)
-
-    def read_sql_to_dataframe(self, query: str, **read_sql_kwargs):
-        '''
-        :param str query: normally a SELECT sql statement
-        :param read_sql_kwargs: additional arguments to pandas read_sql method
-        :return: selected data
-        :rtype: DataFrame
-        '''
-        try:
-            connection = self._engine.connect()
-
-            data = pd.read_sql(sql=query,
-                               con=connection,
-                               **read_sql_kwargs)
-
-            connection.close()
-            return data
-
-        except Exception as e:
-            err = ("Could not read the query to a dataframe. "
-                   "Finished with error {}").format(e)
-
-            self._log.error(err)
-            raise Exception(err)
-
-    def read_table(self, tablename: str,
-                   schema: str = None,
-                   **read_sql_kwargs):
-        '''
-        :param str tablename:
-        :param str schema:
-        :param read_sql_kwargs: additional arguments to the pandas read_sql method
-        :return: selected table
-        :rtype: DataFrame
-        '''
-        schema = '' if schema is None else schema + '.'
-
-        try:
-            return self.read_sql_to_dataframe(
-                    query="SELECT * FROM {0}{1};".format(schema, tablename),
-                    **read_sql_kwargs)
-        except Exception as e:
-            err = ("Could not read the table {0} to a dataframe. "
-                   "Finished with error {1}").format(tablename, e)
-
-            self._log.error(err)
-            raise Exception(err)
-
-    def append_to_table(self, data: pd.DataFrame,
-                        tablename: str,
-                        schema: str = None,
-                        to_sql_kwargs={'index': False}):
-        '''
-        :param DataFrame data: data to append
-        :param str tablename: table where data is appended
-        :param str schema:
-        :param dict to_sql_kwargs: additional arguments to pandas to_sql method
-        '''
-        if schema is not None:
-            self.create_schema(schema)
-
-        try:
-            connection = self._engine.connect()
-
-            data.to_sql(name=tablename,
-                        schema=schema,
-                        con=connection,
-                        if_exists='append',
-                        **to_sql_kwargs)
-
-            connection.close()
-
-        except Exception as e:
-            err = ("Could append data to the table {0}. "
-                   "Finished with error {1}").format(tablename, e)
-
-            self._log.error(err)
-            raise Exception(err)
-
-    def overwrite_table(self, data: pd.DataFrame,
-                        tablename: str,
-                        schema: str = None,
-                        to_sql_kwargs={'index': False}):
-        '''
-        :param DataFrame data: data to write to the database
-        :param str tablename: table where data is written
-        :param str schema:
-        :param to_sql_kwargs: additional arguments to pandas to_sql method
-        '''
-
-        if schema is not None:
-            self.create_schema(schema)
-
-        try:
-
-            connection = self._engine.connect()
-
-            data.to_sql(name=tablename,
-                        schema=schema,
-                        con=connection,
-                        if_exists='replace',
-                        **to_sql_kwargs)
-
-            connection.close()
-
-        except Exception as e:
-            err = ("Could overwrite the table {0}. "
-                   "Finished with error {1}").format(tablename, e)
-
-            self._log.error(err)
-            raise Exception(err)
-
-    def draw_er_diagram_from_db(self, diagram_path: str = None,
-                                schema: str = None,
-                                include_tables: list = None):
-        '''
-        '''
-        if diagram_path is None:
-            diagram_path = "erd.png"
-        else:
-            diagram_dir = os.path.dirname(diagram_path)
-            if diagram_dir != "":
-                os.makedirs(diagram_dir, exist_ok=True)
-
-        import eralchemy
-        eralchemy.render_er(self._db_uri,
-                            diagram_path,
-                            schema=schema,
-                            include_tables=include_tables)
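A minimal usage sketch of the SQLHandler deleted above. This is a sketch only: the db_uri and the 'rs_demo' table are placeholders, and the constructor additionally assumes that the libraries.log and libraries.configuration modules used throughout this repository are importable.

    import pandas as pd
    from db_handlers.SQLHandler import SQLHandler

    sql = SQLHandler(db_uri="mysql+pymysql://user:password@localhost:3306/testdb?charset=utf8")

    # DDL/DML goes through execute(); multi-statement strings are split with sqlparse
    sql.execute("CREATE TABLE IF NOT EXISTS rs_demo (id INT, value VARCHAR(32));")

    # pandas round trip
    sql.append_to_table(data=pd.DataFrame({"id": [1, 2], "value": ["a", "b"]}),
                        tablename="rs_demo")
    print(sql.check_if_table_exists("rs_demo"))   # True
    print(sql.get_column_names("rs_demo"))        # ['id', 'value'] (lower-cased)
    df = sql.read_table("rs_demo")

    sql.drop_table_if_exists("rs_demo")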

BIN
db_handlers/__pycache__/MongodbHandler.cpython-37.pyc


BIN
db_handlers/__pycache__/SQLHandler.cpython-37.pyc


BIN
db_handlers/__pycache__/SQLOperations.cpython-37.pyc


+ 0 - 352
db_migration/DataFrameToCollection.py

@@ -1,352 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Jul 22 11:05:47 2019
-
-@author: tanya
-
-@description: a class to reshape a pandas dataframe to a list of
-(possibly nested) documents with respect to a (json) mongodb schema
-"""
-
-import pandas as pd
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-
-class DataFrameToCollection:
-    '''
-    '''
-    def __init__(self, schema_path: str = None, log_path: str = None):
-        '''
-        '''
-        from libraries.log import Log
-        import json
-
-        self._log = Log("ParseJsonSchema")
-
-        if schema_path is not None:
-
-            if not os.path.isfile(schema_path):
-                err = "JsonSchema not found"
-                self._log.error(err)
-                raise FileNotFoundError(err)
-
-            # load schema to dictionary if it is a valid json file
-            try:
-                with open(schema_path, "r") as f:
-                    self.schema = json.load(f)
-
-            except Exception as e:
-                err = ("Could not load json schema, "
-                       "Obtained error {}".format(e))
-
-                self._log.error(err)
-                raise Exception(err)
-
-        else:
-            self.schema = None
-
-    def to_list_of_documents(self, data: pd.DataFrame,
-                             grp_fields: list,
-                             schema: dict = None,
-                             _return_data: bool = False) -> list:
-        '''
-        Reshapes a pandas dataframe to a list of documents according
-         to a complex (json) mongodb schema
-
-         Remark1: column names of data need to reflect the "nestedness"
-         of the field in the mongodb schema with the help of a "." separator
-         Example: field.sub_field_1, field.sub_field_2
-
-         Remark2: if the schema is stored as a json file, first load it
-         to a dictionary with the help of the python json module
-        '''
-        from copy import deepcopy
-        from libraries.log import Log
-
-        log = Log("reshape_dataframe_to_list_of_documents:")
-
-        data = self._melt_duplicated_columns(data)
-
-        reshaped_fields = []
-
-        if schema is None:
-            schema = self.schema
-
-        for field in schema["properties"]:
-
-            if field not in self._unroll_nested_names(data.columns):
-                continue
-
-            field_type = schema["properties"][field]["bsonType"]
-
-            # if field has a simple type
-            if field_type not in ["array", "object"]:
-
-                grp_fields = [c for c in grp_fields if c in data.columns]
-
-                n_distinct_values = data.groupby(grp_fields)[field].nunique()\
-                                        .max()
-
-                if n_distinct_values != 1:
-                    err = "Field {0} is not unique with respect to {1}"\
-                          .format(field, grp_fields)
-
-                    log.error(err)
-                    raise Exception(err)
-
-                if field not in grp_fields:
-                    reshaped_field = data.groupby(grp_fields)[field].first()
-                else:
-                    reshaped_field =\
-                        data[grp_fields].drop_duplicates()\
-                        .set_index(grp_fields, drop=False)[field]
-
-                reshaped_fields.append(reshaped_field)
-
-            # if field is sub-document (dictionary)
-            elif field_type == "object":
-
-                sub_schema = deepcopy(schema["properties"][field])
-
-                # rename sub-schema properties to match with data column names
-                sub_schema["properties"] =\
-                    {".".join([field, k]): v for k, v
-                     in sub_schema["properties"].items()}
-
-                sub_data = self.to_list_of_documents(
-                            data=data,
-                            schema=sub_schema,
-                            grp_fields=grp_fields,
-                            _return_data=True)
-
-                reshaped_field = sub_data.apply(self._make_dict, axis=1)
-                reshaped_field.name = field
-
-                reshaped_fields.append(reshaped_field)
-
-            # if field is a list of dictionaries
-            elif field_type == "array":
-
-                items_type = schema["properties"][field]["items"]["bsonType"]
-
-                if items_type == "object":
-
-                    sub_schema = deepcopy(schema["properties"][field]["items"])
-
-                    # rename sub-schema properties to match data column names
-                    sub_schema["properties"] =\
-                        {".".join([field, k]): v for k, v in
-                         sub_schema["properties"].items()}
-
-                    # extend grp fields by sub-fields of field simple types
-                    sub_grp_fields =\
-                        [f for f in sub_schema["properties"]
-                         if sub_schema["properties"][f]["bsonType"]
-                         not in ["array", "object"]]
-
-                    if len(sub_grp_fields) == 0:
-                        err = ("One of the sub-keys in a list of documents"
-                               " must be of simple type for the field {}"
-                               .format(field))
-
-                        log.error(err)
-                        raise Exception(err)
-
-                    # group and reshape sub-fields with complex types
-                    sub_data = self.to_list_of_documents(
-                                data=data,
-                                schema=sub_schema,
-                                grp_fields=grp_fields + sub_grp_fields,
-                                _return_data=True)
-
-                    if sub_data is not None:
-
-                        # gather the results into a list of dictionaries
-                        sub_data = sub_data.apply(self._make_dict, axis=1)
-
-                        sub_data.name = field
-                        sub_data = sub_data.reset_index(grp_fields)
-
-                        reshaped_field =\
-                            sub_data.groupby(grp_fields)[field]\
-                                    .apply(self._make_list_of_distinct)
-
-                        reshaped_fields.append(reshaped_field)
-
-                # if field is a list of values with simple type
-                else:
-
-                    grp_fields = [c for c in grp_fields if c in data.columns]
-
-                    if field in data.columns:
-
-                        reshaped_field = data.groupby(grp_fields)[field]\
-                                           .apply(self._make_list_of_distinct)
-
-                        reshaped_fields.append(reshaped_field)
-
-        if len(reshaped_fields) > 0:
-            reshaped_data = pd.concat(reshaped_fields, axis=1)
-
-            if not _return_data:
-
-                list_of_documents =\
-                    reshaped_data.drop(list(reshaped_data.index.names),
-                                       axis=1, errors="ignore")\
-                                 .reset_index(drop=False)
-
-                log.info("Done reshaping the dataframe to a list of documents")
-
-                return list_of_documents
-
-            else:
-
-                return reshaped_data
-
-    def _melt_duplicated_columns(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        for c in set(data.columns):
-            if isinstance(data[c], pd.DataFrame):
-                data = pd.melt(data, id_vars=[cc for cc in data.columns
-                                              if cc != c], value_vars=c)\
-                         .drop("variable", axis=1)\
-                         .rename(columns={"value": c})
-
-        return data
-
-    def _make_dict(self, x: pd.Series) -> dict:
-        '''
-        return: transforms a pandas series into a dictionary;
-         meant to be applied to a dataframe along axis=1,
-         in which case the index of the input series consists of the
-         column names of the dataframe
-        '''
-        return {f.split(".")[-1]: x[f] for f in x.index}
-
-    def _make_list(self, x: pd.Series) -> list:
-        '''
-        return: list of values in a series
-        '''
-        return list(x)
-
-    def _make_list_of_distinct(self, x: pd.Series) -> list:
-        '''
-        return: list of unique values from a Series where
-         entries are arbitrary objects
-         (pandas unique() method does not work if entries are of complex types)
-        '''
-        distinct = []
-        for obj in x:
-            if obj not in distinct:
-                distinct.append(obj)
-        return distinct
-
-    def _unroll_nested_names(self, columns: list) -> list:
-        '''
-        '''
-        unrolled = []
-
-        for c in columns:
-            splitted = c.split(".")
-            for i in range(len(splitted)):
-                unrolled.append(".".join(splitted[:i+1]))
-
-        return unrolled
-
-
-if __name__ == "__main__":
-
-    # Testing
-
-    df = pd.DataFrame({
-                       "a": [1]*8 + [2]*8,
-                       "b": [10]*8 + [20]*8,
-                       "c": [100, 200]*8,
-                       "d.da": [11]*8 + [22]*8,
-                       "d.db": [33]*8 + [34]*8,
-                       "e.ea.eaa": [5]*8 + [55]*8,
-                       "e.ea.eab": [6]*8 + [66]*8,
-                       "e.eb": [2, 2, 3, 3]*4,
-                       "e.ec.eca": [1, 2, 3, 4]*4,
-                       "e.ec.ecb": [5, 6, 7, 8]*4,
-                       "f.fa": [1]*4 + [3]*4 + [11]*4 + [33]*4,
-                       "f.fb": [2]*4 + [3]*2 + [4]*2 + [22]*4 + [44]*4})
-
-    duplicate = pd.DataFrame({"c": [300, 400]*8})
-
-    df = pd.concat([df, duplicate], axis=1)
-
-    schm = {
-              "bsonType": "object",
-              "required": ["a"],
-              "properties": {
-
-                  "a": {"bsonType": "integer"},
-
-                  "b": {"bsonType": "integer"},
-
-                  "c": {
-                      "bsonType": "array",
-                      "items": {"bsonType": "integer"}
-                  },
-                  "d": {
-                      "bsonType": "object",
-                      "properties": {
-                          "da": {"bsonType": "integer"},
-                          "db": {"bsonType": "integer"}
-                       }
-                  },
-                  "e": {
-                      "bsonType": "object",
-                      "properties": {
-                          "ea": {
-                              "bsonType": "object",
-                              "properties": {
-                                  "eaa": {"bsonType": "integer"},
-                                  "eab": {"bsonType": "integer"}
-                               }
-
-                          },
-
-                          "eb": {
-                              "bsonType": "array",
-                              "items": {"bsonType": "integer"}
-                          },
-
-                          "ec": {
-                                "bsonType": "array",
-                                "items": {
-                                  "bsonType": "object",
-                                  "properties": {
-                                      "eca": {"bsonType": "integer"},
-                                      "ecb": {"bsonType": "integer"}
-                                    }
-                                  }
-                          }
-                      }
-                  },
-                  "f": {
-                      "bsonType": "array",
-                      "items": {
-                          "bsonType": "object",
-                          "properties": {
-                              "fa": {"bsonType": "integer"},
-                              "fb": {
-                                  "bsonType": "array",
-                                  "items": {"bsonType": "integer"}
-                              }
-                          }
-                      }
-                  }
-              }
-              }
-
-    grp_fields = ["a"]
-
-    result = DataFrameToCollection().to_list_of_documents(
-                    data=df,
-                    schema=schm,
-                    grp_fields=grp_fields)
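To make the role of the "." separator concrete, here is a smaller, hypothetical example than the test block above. This is a sketch only: it assumes DataFrameToCollection is importable and that the libraries.log module it uses is available; the frame, the mini schema and the expected output are illustrative.

    import pandas as pd
    from db_migration.DataFrameToCollection import DataFrameToCollection

    frame = pd.DataFrame({"a": [1, 1],        # grouping key
                          "d.da": [11, 11],   # becomes d["da"]
                          "d.db": [33, 33]})  # becomes d["db"]

    mini_schema = {
        "bsonType": "object",
        "properties": {
            "a": {"bsonType": "integer"},
            "d": {"bsonType": "object",
                  "properties": {"da": {"bsonType": "integer"},
                                 "db": {"bsonType": "integer"}}}}}

    docs = DataFrameToCollection().to_list_of_documents(
               data=frame, schema=mini_schema, grp_fields=["a"])
    # expected result (roughly): one row with a == 1 and d == {"da": 11, "db": 33}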

+ 0 - 520
db_migration/MigrationCleaning.py

@@ -1,520 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Sep 25 08:09:52 2019
-
-@author: tanya
-"""
-
-import os
-import sys
-import pandas as pd
-import numpy as np
-import gc
-
-sys.path.append(os.getcwd())
-
-from libraries.db_migration.ParseMapping import ParseMapping
-from libraries.db_migration.ParseJsonSchema import ParseJsonSchema
-from libraries.utils.ClassLogging import ClassLogging
-from libraries.utils.CleaningUtils import CleaningUtils
-
-
-class MigrationCleaning(ClassLogging):
-    '''
-    Class for correcting and filtering the incorrect data.
-    We keep the correcting and the filtering methods separated,
-    since there might be other custom steps in between.
-    '''
-    def __init__(self, mapping_path: str,
-                 schema_paths: (str, list),
-                 inconsist_report_table: str = None,
-                 filter_index_columns: (str, list) = None,
-                 mapping_source: str = "internal_name",
-                 mapping_target: str = "mongo_name",
-                 mapping_parser: type = ParseMapping,
-                 schema_parser: type = ParseJsonSchema,
-                 log_name: str = "MigrationCleaning"):
-        '''
-        '''
-        super().__init__(log_name=log_name)
-
-        assert isinstance(inconsist_report_table, str),\
-            "Inconsistent report table should be a tablename string"
-
-        self._inconsist_report_table = inconsist_report_table
-
-        assert isinstance(filter_index_columns, (str, list)),\
-            "Filter index columns must be a str or a list"
-
-        if isinstance(filter_index_columns, str):
-            filter_index_columns = [filter_index_columns]
-
-        self._filter_index_columns = list(filter_index_columns)
-
-        self._schema_parser = schema_parser(schema_paths)
-
-        self._mapping_parser = mapping_parser(mapping_path,
-                                              source=mapping_source,
-                                              target=mapping_target)
-
-        self._mapping_path = mapping_path
-        self._schema_paths = schema_paths
-
-    def _assert_dataframe_input(self, data: pd.DataFrame):
-        '''
-        '''
-        assert(isinstance(data, pd.DataFrame)),\
-            "Parameter 'data' must be a pandas dataframe"
-
-    @property
-    def _field_mapping(self):
-        '''
-        '''
-        return self._mapping_parser.get_field_mapping()
-
-    @property
-    def _required_fields(self):
-        '''
-        '''
-        source_required_fields = self._mapping_parser.get_required_fields()
-        target_required_fields = self._schema_parser.get_required_fields()
-
-        for source_field, target_field in self._field_mapping.items():
-
-            if (target_field in target_required_fields) and\
-                    (source_field not in source_required_fields):
-
-                source_required_fields.append(source_field)
-
-        return source_required_fields
-
-    @property
-    def _default_values(self):
-        '''
-        '''
-        default_values = {}
-
-        target_default_values = self._schema_parser.get_default_values()
-        source_default_values = self._mapping_parser.get_default_values()
-
-        for source_field, target_field in self._field_mapping.items():
-
-            if source_field not in source_default_values:
-                continue
-
-            elif target_field not in target_default_values:
-
-                target_default_values[target_field] = np.nan
-
-            default_values[source_field] = {
-                    target_default_values[target_field]:
-                    source_default_values[source_field]
-                    }
-
-        return default_values
-
-    @property
-    def _python_types(self):
-        '''
-        '''
-        target_types = self._schema_parser.get_python_types()
-
-        result = {}
-
-        for source_field, target_field in self._field_mapping.items():
-
-            if target_field in target_types:
-                result[source_field] = target_types[target_field]
-
-            """
-            date_type_mismatch =\
-                    (target_field in target_types) and\
-                    (source_field in source_types) and\
-                    (target_types[target_field] == str) and\
-                    (source_types[source_field] == np.dtype('<M8[ns]'))
-
-            if date_type_mismatch:
-                target_types[target_field] = np.dtype('<M8[ns]')
-
-            if (source_field in source_types) and\
-                    (target_field in target_types) and\
-                    (target_types[target_field] != source_types[source_field]):
-
-                self.log_and_raise(("Type {0} of field {1} "
-                                    "in schema does not match "
-                                    "type {2} of field {3} in "
-                                    "migration mapping")
-                                   .format(target_types[target_field],
-                                           target_field,
-                                           source_types[source_field],
-                                           source_field))
-
-            if target_field in target_types:
-                source_types[source_field] = target_types[target_field]
-
-            """
-
-        return result
-
-    @property
-    def _value_mappings(self):
-        '''
-        '''
-        return self._mapping_parser.get_value_mappings()
-
-    @property
-    def _date_formats(self):
-        '''
-        '''
-        return self._mapping_parser.get_date_formats()
-
-    def _get_mongo_schema_info(self, method_name: str):
-        '''
-        '''
-        result = {}
-
-        target_dict = getattr(self._schema_parser, method_name)()
-
-        for source_field, target_field in self._field_mapping.items():
-
-            if target_field in target_dict:
-
-                result[source_field] = target_dict[target_field]
-
-        return result
-
-    @property
-    def _allowed_values(self):
-        '''
-        '''
-        return self._get_mongo_schema_info("get_allowed_values")
-
-    @property
-    def _minimum_values(self):
-        '''
-        '''
-        return self._get_mongo_schema_info("get_minimum_value")
-
-    @property
-    def _maximum_values(self):
-        '''
-        '''
-        return self._get_mongo_schema_info("get_maximum_value")
-
-    @property
-    def _patterns(self):
-        '''
-        '''
-        return self._get_mongo_schema_info("get_patterns")
-
-    def _filter_invalid_data(self, data: pd.DataFrame,
-                             invalid_mask: pd.Series,
-                             reason: (str, pd.Series)) -> pd.DataFrame:
-        '''
-        '''
-        from libraries.db_handlers.SQLHandler import SQLHandler
-
-        assert((self._inconsist_report_table is not None) and
-               (self._filter_index_columns is not None)),\
-            "Inconsistent report table or filter index is not provided"
-
-        self._assert_dataframe_input(data)
-
-        data = data.copy(deep=True)
-
-        db = SQLHandler()
-
-        if invalid_mask.sum() == 0:
-
-            return data
-
-        data_inconsist = data.assign(reason=reason)\
-                             .loc[invalid_mask]\
-                             .reset_index(drop=True)
-
-        db.append_to_table(data=data_inconsist,
-                           tablename=self._inconsist_report_table)
-
-        n_rows_filtered = len(data_inconsist)
-        n_instances_filtered = len(data_inconsist[self._filter_index_columns].drop_duplicates())
-
-        del data_inconsist
-        gc.collect()
-
-        self._log.warning(("Filtering: {0} ."
-                           "Filtered {1} rows "
-                           "and {2} instances"
-                           .format(reason, n_rows_filtered, n_instances_filtered)))
-
-        nok_index_data = data.loc[invalid_mask, self._filter_index_columns]\
-                             .drop_duplicates().reset_index(drop=True)
-
-        nok_index = pd.MultiIndex.from_arrays([nok_index_data[c] for c in
-                                               self._filter_index_columns])
-
-        all_index = pd.MultiIndex.from_arrays([data[c] for c in
-                                               self._filter_index_columns])
-
-        data = data.loc[~all_index.isin(nok_index)].reset_index(drop=True)
-
-        return data
-
-    def _replace_values(self, data: pd.DataFrame,
-                        default: bool) -> pd.DataFrame:
-        '''
-        '''
-        if default:
-            default_str = "default"
-        else:
-            default_str = "equal"
-
-        self._assert_dataframe_input(data)
-
-        data = data.copy(deep=True)
-
-        if default:
-            mapping = self._default_values
-        else:
-            mapping = self._value_mappings
-
-        for column, d in mapping.items():
-
-            try:
-
-                if column not in data.columns:
-                    continue
-
-                dtype = data[column].dtype
-
-                for key, values in d.items():
-
-                    if not default:
-
-                        mask = (data[column].astype(str).isin(values))
-
-                    else:
-                        mask = (data[column].isin(values))
-
-                    if default:
-
-                        mask = mask | (data[column].isnull())
-
-                    data.loc[mask, column] = key
-
-                data[column] = data[column].astype(dtype)
-
-            except Exception as e:
-
-                self.log_and_raise(("Failed to replace {0} values "
-                                    "in {1}. Exit with error {2}"
-                                    .format(default_str, column, e)))
-
-        self._log.info("Replaced {} values".format(default_str))
-
-        return data
-
-    def replace_default_values(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        return self._replace_values(data=data, default=True)
-
-    def map_equal_values(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        return self._replace_values(data=data, default=False)
-
-    def convert_types(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        self._assert_dataframe_input(data)
-
-        for column, python_type in self._python_types.items():
-
-            try:
-                if column not in data.columns:
-                    continue
-
-                elif column in self._date_formats:
-
-                    data[column] = CleaningUtils.convert_dates(
-                            series=data[column],
-                            formats=self._date_formats[column])
-
-                elif (python_type == int) and data[column].isnull().any():
-
-                    self.log_and_raise(("Column {} contains missing values "
-                                        "and cannot be of integer type"
-                                        .format(column)))
-
-                elif python_type == str:
-
-                    python_type = object
-
-                else:
-
-                    data[column] = data[column].astype(python_type)
-
-                if data[column].dtype != python_type:
-
-                    self._log.warning(("After conversion type in {0} "
-                                       "should be {1} "
-                                       "but is still {2}"
-                                       .format(column,
-                                               python_type,
-                                               data[column].dtype)))
-
-            except Exception as e:
-
-                self.log_and_raise(("Failed to convert types in {0}. "
-                                    "Exit with error {1}"
-                                    .format(column, e)))
-
-        self._log.info("Converted dtypes")
-
-        return data
-
-    def filter_invalid_null_values(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        self._assert_dataframe_input(data)
-
-        for column in data.columns:
-
-            if (column in self._required_fields) and\
-                    (data[column].isnull().any()):
-
-                invalid_mask = data[column].isnull()
-
-                reason = "Null value in the required field {}"\
-                         .format(column)
-
-                data = self._filter_invalid_data(data=data,
-                                                 invalid_mask=invalid_mask,
-                                                 reason=reason)
-
-        return data
-
-    def filter_invalid_types(self, data: pd.DataFrame) -> pd.DataFrame():
-        '''
-        '''
-        self._assert_dataframe_input(data)
-
-        for column, python_type in self._python_types.items():
-
-            if data[column].dtype != python_type:
-
-                def mismatch_type(x):
-                    return type(x) != python_type
-
-                invalid_mask = data[column].apply(mismatch_type)
-
-                reason = "Type mismatch if field {}".format(column)
-
-                data = self._filter_invalid_data(data=data,
-                                                 invalid_mask=invalid_mask,
-                                                 reason=reason)
-
-        return data
-
-    def filter_invalid_patterns(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        self._assert_dataframe_input(data)
-
-        for column, pattern in self._patterns.items():
-
-            invalid_mask = (~data[column].astype(str).str.match(pattern))
-
-            reason = "Pattern mismatch in field {}".format(column)
-
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
-
-        return data
-
-    def filter_notallowed_values(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        for column, value in self._minimum_values.items():
-
-            invalid_mask = data[column] < value
-
-            reason = "Too small values in field {}".format(column)
-
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
-
-        for column, value in self._maximum_values.items():
-
-            invalid_mask = data[column] > value
-
-            reason = "Too large values in field {}".format(column)
-
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
-
-        for column, allowed_values in self._allowed_values.items():
-
-            invalid_mask = (~data[column].isin(allowed_values))
-
-            reason = "Too small values in field {}".format(column)
-
-            data = self._filter_invalid_data(data=data,
-                                             invalid_mask=invalid_mask,
-                                             reason=reason)
-
-        return data
-
-
-if __name__ == "__main__":
-
-    # testing
-
-    from libraries.db_handlers.SQLHandler import SQLHandler
-
-    mapping_path = os.path.join(".", "migration_mappings", "rs1_mapping.json")
-
-    schema_paths = [
-            os.path.join(".", "mongo_schema", "schema_wheelsets.json"),
-            os.path.join(".", "mongo_schema", "schema_process_instances.json")]
-
-    inconsist_report_table = "test_inconsist_report_rs1"
-
-    if all([os.path.isfile(p) for p in schema_paths + [mapping_path]]):
-
-        print("Found schemas!")
-
-        cleaner = MigrationCleaning(
-                mapping_path=mapping_path,
-                schema_paths=schema_paths,
-                mapping_source="internal_name",
-                mapping_target="mongo_name",
-                filter_index_columns=["radsatznummer"],
-                inconsist_report_table=inconsist_report_table)
-
-        db = SQLHandler()
-
-        data = db.read_sql_to_dataframe("select * from rs1 limit 100")
-
-        data = cleaner.replace_default_values(data)
-
-        data = cleaner.map_equal_values(data)
-
-        data = cleaner.convert_types(data)
-
-        non_filtered_len = len(data)
-
-        data = cleaner.filter_invalid_types(data)
-
-        if len(data) < non_filtered_len:
-
-            data = cleaner.convert_types(data)
-
-        data = cleaner.filter_invalid_null_values(data)
-
-        data = cleaner.filter_invalid_patterns(data)
-
-        data = cleaner.filter_notallowed_values(data)
-
-    print("Done!")

+ 0 - 62
db_migration/ParseDbSchema.py

@@ -1,62 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Wed Sep 25 08:22:20 2019
-
-@author: tanya
-"""
-
-import os
-import sys
-import abc
-sys.path.append(os.getcwd())
-
-
-class ParseDbSchema(metaclass=abc.ABCMeta):
-    '''
-    '''
-    def __init__(self, schema_paths: [list, str], log_file: str = None):
-        '''
-        '''
-        from libraries.log import Log
-
-        self._log = Log(name="ParseDbSchema:", log_file=log_file)
-
-        if isinstance(schema_paths, str):
-            schema_paths = [schema_paths]
-
-        for schema_path in schema_paths:
-            if not os.path.isfile(schema_path):
-                err = "Schema not found"
-                self._log.error(err)
-                raise FileNotFoundError(err)
-
-    @abc.abstractmethod
-    def get_fields(self) -> list:
-        '''
-        '''
-        return
-
-    @abc.abstractmethod
-    def get_datetime_fields(self) -> list:
-        '''
-        '''
-        return
-
-    @abc.abstractmethod
-    def get_python_types(self) -> list:
-        '''
-        '''
-        return
-
-    @abc.abstractmethod
-    def get_default_values(self) -> list:
-        '''
-        '''
-        return
-
-    @abc.abstractmethod
-    def get_allowed_values(self) -> list:
-        '''
-        '''
-        return

+ 0 - 332
db_migration/ParseJsonSchema.py

@@ -1,332 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Thu Jan 31 11:41:48 2019
-
-@author: tanya
-"""
-
-import os
-import sys
-from copy import deepcopy
-import numpy as np
-
-sys.path.append(os.getcwd())
-
-from libraries.db_migration.ParseDbSchema import ParseDbSchema
-
-
-class ParseJsonSchema(ParseDbSchema):
-    '''
-    Class for retrieving column properties from mongodb jsonSchema
-    '''
-
-    def __init__(self, schema_paths: [list, str], log_file: str = None):
-        '''
-        '''
-        import json
-        from libraries.log import Log
-
-        super().__init__(schema_paths=schema_paths, log_file=log_file)
-
-        self._log = Log(name="ParseJsonSchema", log_file=log_file)
-
-        # load schemas to dictionaries if they are valid json files
-
-        assert(isinstance(schema_paths, (list, str))),\
-            "Schema paths must be either str or lists"
-
-        if isinstance(schema_paths, str):
-            schema_paths = [schema_paths]
-
-        self.schemas = []
-
-        for schema_path in schema_paths:
-            try:
-                with open(schema_path, "r") as f:
-                    self.schemas.append(json.load(f))
-
-            except Exception as e:
-                err = ("Could not load json schema, "
-                       "Obtained error {}".format(e))
-
-                self._log.error(err)
-                raise Exception(err)
-
-    def get_fields(self) -> list:
-        '''
-        '''
-        return self._parse()
-
-    def get_required_fields(self) -> list:
-        '''
-        '''
-        return self._parse(required_only=True)
-
-    def get_mongo_types(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="bsonType")
-
-    def get_datetime_fields(self):
-        '''
-        '''
-        mongo_types = self.get_mongo_types()
-
-        return [k for k, v in mongo_types.items()
-                if v in ["date", "timestamp", "Date", "Timestamp"]]
-
-    def get_python_types(self) -> dict:
-        '''
-        '''
-        mongo_types = self.get_mongo_types()
-        python_types = {}
-
-        bson_to_python_types_except_dates = {"double": float,
-                                             "decimal": float,
-                                             "string": str,
-                                             "object": object,
-                                             "array": list,
-                                             "bool": bool,
-                                             "int": int,
-                                             "long": int,
-                                             "date": np.dtype('<M8[ns]'),
-                                             "timestamp": np.dtype('<M8[ns]')
-                                             }
-
-        for k, v in mongo_types.items():
-
-            if isinstance(v, list):
-                if ("date" in v) or ("timestamp" in v):
-                    v = "date"
-                elif "string" in v:
-                    v = "string"
-                elif ("double" in v) or ("decimal" in v):
-                    v = "double"
-                elif ("null" in v) and (len(v) == 2) and ("int" not in v):
-                    v = [t for t in v if t != "null"][0]
-                else:
-                    err = "Type {0}: {1} not convertibale".format(k, v)
-                    self._log.error(err)
-                    raise Exception(err)
-
-            if v in bson_to_python_types_except_dates:
-                python_types[k] = bson_to_python_types_except_dates[v]
-
-        return python_types
-
-    def get_patterns(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="pattern")
-
-    def get_default_values(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="default")
-
-    def get_allowed_values(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="enum")
-
-    def get_maximum_value(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="maximum")
-
-    def get_minimum_value(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="minimum")
-
-    def get_max_items(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="maxItems")
-
-    def get_min_items(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="minItems")
-
-    def get_field_descriptions(self) -> dict:
-        '''
-        '''
-        return self._parse(field_info="description")
-
-    def _parse(self,
-               field_info: str = None,
-               required_only: bool = False):
-        '''
-        '''
-        result = self._parse_one(schema=self.schemas[0],
-                                 field_info=field_info,
-                                 required_only=required_only)
-
-        for schema in self.schemas[1:]:
-
-            next_result = self._parse_one(schema=schema,
-                                          field_info=field_info,
-                                          required_only=required_only)
-
-            if isinstance(result, list):
-                result.extend(next_result)
-            else:
-                result.update(next_result)
-
-        return result
-
-    def _parse_one(self,
-                   schema: dict,
-                   field_info: str = None,
-                   required_only: bool = False,
-                   super_field_name: str = None,
-                   already_parsed: (list, dict) = None) -> (list, dict):
-        '''
-        Recursive function that returns a list of (nested) field names or
-        a dictionary of (nested) field names with field characteristics.
-
-        :param schema: if None => entire self.schema, or a sub-schema
-            of self.schema
-
-        :param field_info: optional, if provided a dictionary of field
-            names with field characteristics is returned (for example,
-            the bsonType of each field), else a list of fields is returned
-
-        :param required_only: when True, only returns fields marked as
-            required in the mongo schema
-
-        :param super_field_name: needed for recursion
-            Example: the field 'article' has
-            subfields 'id' and 'supplier'.
-            If we parse the sub-document corresponding to article, then
-            super_field_name is 'article' and we might get an output like
-            {'article.id': string, 'article.supplier': string}
-
-        :param already_parsed: needed for recursion
-
-        '''
-        schema = deepcopy(schema)
-
-        assert(isinstance(schema, dict)),\
-            "Parameter 'schema' must be a dict"
-
-        if field_info is None:
-            # parse a list of fields
-            if already_parsed is None:
-                already_parsed = []
-            else:
-                assert(isinstance(already_parsed, list)),\
-                    "Parameter 'already_parsed' must be of type list"
-        else:
-            # parse a dictionary of field names with field characteristics
-            if already_parsed is None:
-                already_parsed = {}
-            else:
-                assert(isinstance(already_parsed, dict)),\
-                    "Parameter 'already_parsed' must be of type dict"
-
-        # If schema is nested, then
-        # either it is of bsonType object
-        # and the field information is stored under the key 'properties'
-        # or it is of bsonType array
-        # and the field information is stored in sub-schemas
-        # under the key 'items'
-
-        # if schema is of bsonType object
-        if "properties" in schema.keys():
-            if "required" in schema.keys():
-                required_subfields = schema["required"]
-
-            for sub_field_name in schema["properties"].keys():
-
-                sub_schema = schema["properties"][sub_field_name]
-
-                # only process fields that are required
-                if required_only and\
-                        (sub_field_name not in required_subfields):
-                    pass
-                else:
-                    if super_field_name is not None:
-                        field_name = '.'.join([super_field_name,
-                                               sub_field_name])
-                    else:
-                        field_name = sub_field_name
-
-                    # if the given sub-field is nested, parse the
-                    # sub-schema corresponding to this sub-field
-                    self._parse_one(
-                            schema=sub_schema,
-                            super_field_name=field_name,
-                            field_info=field_info,
-                            already_parsed=already_parsed,
-                            required_only=required_only)
-
-        # if schema is of bsonType array
-        elif "items" in schema.keys():
-            # one schema for all items
-            if isinstance(schema["items"], dict):
-
-                sub_schema = schema["items"]
-
-                self._parse_one(schema=sub_schema,
-                                super_field_name=super_field_name,
-                                field_info=field_info,
-                                already_parsed=already_parsed,
-                                required_only=required_only)
-
-            # list of separate schemas for each item
-            elif isinstance(schema["items"], list):
-
-                for sub_schema in schema["items"]:
-                    self._parse_one(schema=sub_schema,
-                                    super_field_name=super_field_name,
-                                    field_info=field_info,
-                                    already_parsed=already_parsed,
-                                    required_only=required_only)
-            else:
-                raise Exception(('Schema is not composed correctly: '
-                                 'items must be a dictionary or a list'))
-        else:
-            # If neither properties nor items is in schema keys
-            # we reached the last level of nestedness,
-            # field information is stored in the schema keys.
-            field_name = super_field_name
-
-            if field_info is None:
-                already_parsed.append(field_name)
-            else:
-                if field_info in schema.keys():
-                    already_parsed[field_name] = schema[field_info]
-                else:
-                    pass
-
-        return already_parsed
-
-
-if __name__ == "__main__":
-
-    # Only for testing
-
-    schema_path = os.path.join(".", "mongo_schema", "schema_wheelsets.json")
-
-    if os.path.isfile(schema_path):
-
-        parse_obj = ParseJsonSchema(schema_paths=schema_path)
-
-        fields = parse_obj.get_fields()
-
-        required_fields = parse_obj.get_required_fields()
-
-        patterns = parse_obj.get_patterns()
-
-        mongo_types = parse_obj.get_mongo_types()
-
-        python_types_except_dates = parse_obj.get_python_types()
-
-        datetime_fields = parse_obj.get_datetime_fields()
-
-        allowed_values = parse_obj.get_allowed_values()
-
-        descriptions = parse_obj.get_field_descriptions()
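
For reference, the nested-schema traversal removed with ParseJsonSchema above walks objects via their 'properties' key and arrays via their 'items' key, joining nested names with dots. A minimal stand-alone sketch of that traversal, using a hypothetical toy schema that is not part of the repository:

# Stand-alone sketch of the properties/items recursion (toy schema, illustrative only).
def list_fields(schema: dict, prefix: str = None) -> list:
    """Return dotted names of all leaf fields, mirroring ParseJsonSchema._parse_one."""
    fields = []
    if "properties" in schema:          # bsonType object: fields live under 'properties'
        for name, sub_schema in schema["properties"].items():
            full_name = name if prefix is None else prefix + "." + name
            fields.extend(list_fields(sub_schema, full_name))
    elif "items" in schema:             # bsonType array: sub-schemas live under 'items'
        items = schema["items"]
        for sub_schema in (items if isinstance(items, list) else [items]):
            fields.extend(list_fields(sub_schema, prefix))
    elif prefix is not None:            # leaf level: the schema keys hold the field info
        fields.append(prefix)
    return fields

toy_schema = {
    "bsonType": "object",
    "properties": {
        "article": {
            "bsonType": "object",
            "properties": {"id": {"bsonType": "string"},
                           "supplier": {"bsonType": "string"}},
        },
        "measurements": {"bsonType": "array", "items": {"bsonType": "double"}},
    },
}

print(list_fields(toy_schema))
# ['article.id', 'article.supplier', 'measurements']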

+ 0 - 157
db_migration/ParseMapping.py

@@ -1,157 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Sep 20 15:33:17 2019
-
-@author: tanya
-"""
-
-import os
-import sys
-import numpy as np
-sys.path.append(os.getcwd())
-
-
-class ParseMapping:
-    '''
-    '''
-    def __init__(self, mapping_path: str, log_name: str = "ParseMapping",
-                 source: str = "original_name", target: str = "original_name"):
-        '''
-        '''
-        import json
-        from libraries.log import Log
-
-        self._log = Log(log_name)
-
-        if not os.path.isfile(mapping_path):
-            err = "Mapping not found"
-            self._log.error(err)
-            raise FileNotFoundError(err)
-
-        try:
-            with open(mapping_path, "r") as f:
-                self._mapping = json.load(f)
-
-        except Exception as e:
-            err = ("Could not load mapping. "
-                   "Exit with error {}".format(e))
-            self._log.error(err)
-            raise Exception(err)
-
-        self._source = source
-        self._target = target
-
-    def get_field_mapping(self) -> dict:
-        '''
-        '''
-        assert(all([set([self._source, self._target]) <= set(d)
-                    for d in self._mapping]))
-
-        return {d[self._source]: d[self._target] for d in self._mapping}
-
-    def _get_fields_satisfying_condition(self, key: str, value) -> list:
-        '''
-        '''
-        assert(all([self._source in d for d in self._mapping])),\
-            "Invalid from field"
-
-        return [d[self._source] for d in self._mapping
-                if (key in d) and (d[key] == value)]
-
-    def get_required_fields(self) -> list:
-        '''
-        '''
-        return self._get_fields_satisfying_condition(key="required",
-                                                     value=1)
-
-    def get_date_fields(self) -> list:
-        '''
-        '''
-        return self._get_fields_satisfying_condition(key="type",
-                                                     value="Date")
-
-    def _get_info(self, key: str, value=None) -> dict:
-        '''
-        '''
-        assert(all([self._source in d for d in self._mapping])),\
-            "Invalid from field"
-
-        return {d[self._source]: d[key] for d in self._mapping
-                if (key in d) and ((value is None)
-                                   or (d[key] == value))}
-
-    def get_default_values(self) -> dict:
-        '''
-        '''
-        return self._get_info(key="default_values")
-
-    def get_date_formats(self) -> dict:
-        '''
-        '''
-        return self._get_info(key="date_format")
-
-    def get_types(self) -> dict:
-        '''
-        '''
-        return self._get_info(key="type")
-
-    def get_python_types(self) -> dict:
-        '''
-        '''
-        sql_to_python_dtypes = {
-                "Text": str,
-                "Date": np.dtype('<M8[ns]'),
-                "Double": float,
-                "Integer": int
-                }
-
-        sql_types = self.get_types()
-
-        return {k: sql_to_python_dtypes[v] for k, v in sql_types.items()}
-
-    def get_value_mappings(self) -> dict:
-        '''
-        '''
-        return self._get_info(key="value_mapping")
-
-    def get_column_numbers(self) -> list:
-        '''
-        '''
-        if all(["column_number" in d for d in self._mapping]):
-            column_numbers = [d["column_number"] for d in self._mapping]
-
-        elif all(["column_number" not in d for d in self._mapping]):
-            column_numbers = list(range(len(self._mapping)))
-
-        else:
-            err = ("Incorrectly filled mapping. Column numbers should ",
-                   "either in all or in neither of the fields")
-            self.log.err(err)
-            raise Exception(err)
-
-        return column_numbers
-
-
-if __name__ == "__main__":
-
-    mapping_path = os.path.join(".", "migration_mappings", "rs0_mapping.json")
-
-    if os.path.isfile(mapping_path):
-
-        print("found mapping path")
-
-        parser = ParseMapping(mapping_path, source="internal_name",
-                              target="mongo_name")
-
-        internal_to_mongo_mapping = parser.get_field_mapping()
-
-        original_to_internal_mapping = parser.get_field_mapping()
-
-        default_values = parser.get_default_values()
-
-        types = parser.get_types()
-
-        column_numbers = parser.get_column_numbers()
-
-        print("Done testing!")

BIN
db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc


BIN
db_migration/__pycache__/MigrationCleaning.cpython-37.pyc


BIN
db_migration/__pycache__/ParseDbSchema.cpython-37.pyc


BIN
db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc


BIN
db_migration/__pycache__/ParseMapping.cpython-37.pyc


+ 0 - 798
hyperopt/HyperoptPipelineSelection.py

@@ -1,798 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Nov  9 13:27:44 2018
-
-@author: tanja
-@description: Implementation of machine learning
-                pipeline selection and tuning with hyperopt library
-"""
-
-import os
-import sys
-import gc
-import logging
-import pickle
-import time
-import datetime
-
-import pandas as pd
-import numpy as np
-
-from sklearn.pipeline import Pipeline
-
-from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
-    space_eval, pyll
-
-from sklearn.model_selection import cross_validate
-from sklearn.metrics import make_scorer
-
-
-class HyperoptPipelineSelection:
-    '''
-    Use this class to perform a search
-    for a machine learning pipeline in a given parameter space.
-    The parameter space can include multiple types of Pipelines
-    (SVM, XGBOOST, random forest, etc),
-    as well as parameter distributions for each pipeline parameter.
-    See example in main for the expected space structure.
-
-    The search can be performed either randomly
-    or with a tree-based algorithm. (Other methods are currently
-    developed by the hyperopt creators.)
-
-    Attribute trials is responsible for book-keeping parameter
-    combinations that have already been tried out. This attribute
-    is saved to a binary file every backup_trials_freq trials as well as
-    every time a better pipeline is found.
-    '''
-    def __init__(self,
-                 cost_func,
-                 greater_is_better: bool,
-                 trials_path: str,
-                 backup_trials_freq: int = 1,
-                 log_path: str = None,
-                 averaging_func: callable = None):
-        '''
-        :param callable cost_func: function to minimize or maximize
-
-        :param bool greater_is_better: when True
-            cost_func is maximized, else minimized.
-
-        :param str trials_path: path at which the trials object is saved
-            in binary format. From the trials object we can
-            select information about the obtained scores, score variations,
-            and the pipelines and parameters tried out so far. If a trials object
-            already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
-
-        :param backup_trials_freq: frequency in iterations (trials)
-            of saving the trials object at the trials_path.
-
-        :param str log_path: Optional, when not provided logs to stdout.
-
-        :param callable averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            The classic choice is to take the mean;
-            another example is mean() - c*var().
-        '''
-
-        assert(callable(cost_func)),\
-            "Parameter 'cost_func' must be a callable"
-
-        assert(isinstance(greater_is_better, bool)),\
-            "Parameter 'greater_is_better' must be bool type"
-
-        assert(isinstance(trials_path, str)),\
-            "Parameter 'trials_path' must be of string type"
-
-        if averaging_func is not None:
-            assert(callable(averaging_func)),\
-                "Parameter 'averaging_func' must be a callable"
-
-        self._assert_valid_directory(path=trials_path)
-
-        self._configure_logger(log_path)
-
-        self._cost_func = cost_func
-        # is 1 when cost_func is minimized, -1 when cost func is maximized
-        self._score_factor = (not greater_is_better) - greater_is_better
-        self._trials_path = trials_path
-        # is initialized with empty trials object
-        self._trials = Trials()
-        self._backup_trials_freq = backup_trials_freq
-        self._averaging_func = averaging_func or np.mean
-        # keeping track of the current search iteration
-        self._run_number = 0
-        # space and data need to be attached to perform search.
-        self._space_attached = False
-        self._data_attached = False
-
-        # if a trials object already exists at the given path,
-        # it is loaded and the search is continued. Else,
-        # the search is started from the beginning.
-        if os.path.isfile(trials_path):
-            try:
-                with open(trials_path, "rb") as f:
-                    self._trials = pickle.load(f)
-
-                self._logger.info(("Loaded an existing trials object"
-                                   "Consisting of {} trials")
-                                  .format(len(self._trials.trials)))
-
-            except Exception as e:
-                self._logger.error(("Trials object could not be loaded. "
-                                    "Training starts from the beginning. "
-                                    "Exit with error {}").format(e))
-
-        else:
-            self._logger.info(("No existing trials object was found"
-                               "Initialized an empty trials object."))
-
-        self._best_score = self.best_trial_score
-
-    def _configure_logger(self, log_path: str = None):
-        '''
-        Can be replaced with the existing script later.
-        When log_path is not provided, logs to stdout.
-        '''
-
-        self._logger = logging.getLogger(__name__)
-
-        if (self._logger.hasHandlers()):
-            self._logger.handlers.clear()
-
-        if log_path is not None:
-            assert(isinstance(log_path, str)),\
-                "Parameter 'log_path' must be of string type"
-            self._assert_valid_directory(log_path)
-
-            handler = logging.FileHandler(log_path)
-        else:
-            handler = logging.StreamHandler(sys.stdout)
-
-        formatter = logging.Formatter(
-                '\n %(asctime)s %(levelname)s %(message)s')
-
-        handler.setFormatter(formatter)
-        self._logger.addHandler(handler)
-        self._logger.setLevel("INFO")
-
-    def _backup_trials(self):
-        '''
-        Pickles (Saves) the trials object.
-        Used in a scheduler.
-        '''
-        with open(self._trials_path, "wb") as f:
-            pickle.dump(self._trials, f)
-
-    def _assert_valid_directory(self, path: str):
-        '''
-        If the directory of a path does not exist yet,
-        creates it.
-        '''
-        assert(isinstance(path, str)),\
-            "Parameter 'path' must of str type"
-
-        dirname = os.path.dirname("path")
-
-        if len(dirname) > 0:
-            os.mkdir(dirname, exists_ok=True)
-
-    def attach_space(self, space: pyll.base.Apply = None,
-                     module_path: str = None,
-                     name: str = None):
-        '''
-        :param pyll.base.Apply space: hyperopt space where
-            the search is performed. Optional when a space
-            is loaded from a python module.
-
-        :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
-
-        :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
-        '''
-        assert((space is not None) or
-               ((module_path is not None) and (name is not None))),\
-            "Either space or (module_path, name) must be provided"
-
-        if space is None:
-            for p in ["modele_path", "name"]:
-                assert(isinstance(p, str)),\
-                    "Parameter '{}' must be of str type".format(p)
-
-            assert(os.path.isfile(module_path)),\
-                "Parameter 'module_path' must be a valid file"
-
-            module, extension = os.path.splitext(os.path.basename(module_path))
-            assert(extension == ",py"),\
-                "Parameter 'space' must be read from a python file"
-
-            sys.path.insert(module_path)
-
-            try:
-                from module import name as space
-            except ImportError:
-                err = "Invalid space location or name"
-                self._logger.error(err)
-                raise Exception(err)
-
-        assert(isinstance(space, pyll.base.Apply)),\
-            "Parameter 'space' must be of hyperopt space type"
-
-        self._space = space
-        self._logger.info("Attached parameter distribution space")
-        self._space_attached = True
-
-    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
-            -> np.ndarray:
-        '''
-        Converts a pandas DataFrame or Series to a numpy array.
-        '''
-        if isinstance(x, np.ndarray):
-            return x
-
-        elif (isinstance(x, pd.core.frame.DataFrame))\
-                or (isinstance(x, pd.core.series.Series)):
-            return x.values
-
-        else:
-            e = 'The argument must be a numpy array or a pandas DataFrame'
-            self._logger.critical(e)
-            raise ValueError(e)
-
-    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
-                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    X_val: (pd.DataFrame, np.ndarray) = None,
-                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    cv: (list, int) = None):
-        '''
-        :param array X_train: data on which
-            machine learning pipelines are trained
-
-        :param array y_train: optional, vector with targets,
-            (not all algorithms require targets)
-
-        :param array X_val: optional, validation data.
-            When not provided, cross-validated value
-            of the cost_func is calculated.
-
-        :param array y_val: optional, validation targets
-
-        :param list cv: list of tuples containing
-            train and validation indices or an integer representing
-            the number of folds for a random split of data
-            during cross-validation
-            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
-        '''
-
-        X_train = self._convert_to_array(X_train)
-        if y_train is not None:
-            y_train = self._convert_to_array(y_train)
-
-        if X_val is not None:
-            if cv is not None:
-                self._logger.warning(("Both validation set and cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on the validation set!"))
-
-            X_val = self._convert_to_array(X_val)
-
-            train_inds = list(range(len(X_train)))
-            val_inds = list(range(len(X_train),
-                                  len(X_train) + len(X_val)))
-
-            # cost is evaluated with a cross validation function
-            # that accepts an array and a cv object with
-            # indices of the fold splits.
-            # Here we create a trivial cv object
-            # with one validation split.
-            self._cv = [(train_inds, val_inds)]
-            self._X = np.concatenate([X_train, X_val])
-
-            if y_train is not None:
-                if y_val is None:
-                    err = "Argument y_val must be provided"
-                    self._logger.critical(err)
-                    raise ValueError(err)
-                else:
-                    y_val = self._convert_to_array(y_val)
-                    self._y = np.concatenate([y_train, y_val])
-            else:
-                self._y = None
-        else:
-            if cv is None:
-                self._logger.warning(("Neither validation set nor cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on 5 randomly "
-                                      "splitted folds."))
-
-            self._X = X_train
-            self._y = y_train
-            self._cv = cv
-
-        self._logger.info("Attached data")
-        self._data_attached = True
-
-    def _evaluate(self, pipeline: Pipeline) -> dict:
-        '''
-        This method is called in _objective.
-
-        Calculates the cost on the attached data.
-        This function can be overriden, when the cost
-        needs to be calculated differently,
-        for example with a tensorflow model.
-
-        :param Pipeline pipeline: machine learning pipeline
-            that will be evaluated with cross-validation
-
-        :output: dictionary with the aggregated
-            cross-validation score and
-            the score variance.
-        '''
-
-        scores = cross_validate(estimator=pipeline,
-                                X=self._X,
-                                y=self._y,
-                                cv=self._cv or 5,
-                                scoring=make_scorer(self._cost_func),
-                                error_score=np.nan)
-
-        return {'value': self._averaging_func(scores['test_score']),
-                'variance': np.var(scores['test_score'])}
-
-    def _objective(self, space_element: dict) -> dict:
-        '''
-        This method is called in search_for_best_pipeline
-        inside the hyperopt fmin method.
-
-        Uses _evaluate method.
-
-        It must take as input a space element
-        and produce an output in the form of dictionary
-        with 2 obligatory values loss and status
-        (STATUS_OK or STATUS_FAIL). Other
-        values in the output are optional and can be
-        accessed later through the trials object.
-
-        :Warning: fmin minimizes the loss,
-        when _evaluate returns a value to be maximized,
-        it should be multiplied by -1 to obtain loss.
-
-        :param dict space_element: must contain keys
-            name (with the name of the pipeline),
-            pipeline (Pipeline object),
-            params (dict of pipeline params)
-
-        :output: dictionary with keys
-            loss (minimized value),
-            status with values STATUS_OK or STATUS_FAIL
-            understood by hyperopt,
-            score (equal to loss or -loss),
-            score_variance,
-            timestamp (end of execution),
-            train_time: execution time
-        '''
-        assert(isinstance(space_element, dict) and
-               set(['name', 'pipeline', 'params']) <= space_element.keys())
-
-        assert(isinstance(space_element['name'], str) and
-               isinstance(space_element['pipeline'], Pipeline) and
-               isinstance(space_element['params'], dict))
-
-        start_time = time.time()
-
-        if not self._data_attached:
-            raise Exception(("Data must be attached in order "
-                             "in order to effectuate the best"
-                             "pipeline search"))
-
-        self._run_number += 1
-
-        pipeline = space_element['pipeline']
-        params = space_element['params']
-        pipeline.set_params(**params)
-
-        self._logger.info(("Run number {0}: "
-                           "Current score is {1}: "
-                           "Training pipeline {2} "
-                           "with parameters: {3}. ").format(
-                             self._run_number,
-                             self._best_score,
-                             space_element['name'],
-                             params))
-
-        try:
-            score_stats = self._evaluate(pipeline)
-            assert(not np.isnan(score_stats["value"])),\
-                "Returned null score"
-
-            if self._run_number % self._backup_trials_freq == 0:
-                self._backup_trials()
-
-            if (self._best_score != self._best_score) or\
-                self._score_factor*score_stats["value"] <\
-                    self._score_factor*self._best_score:
-
-                self._logger.info("Score got better, new best score is: {}"
-                                  .format(score_stats["value"]))
-
-                self._best_score = score_stats['value']
-
-                self._backup_trials()
-
-            end_time = time.time()
-
-            return {'loss': self._score_factor * score_stats["value"],
-                    'status': STATUS_OK,
-                    'score': score_stats["value"],
-                    'score_variance': score_stats["variance"],
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': end_time - start_time}
-
-        except Exception as e:
-
-            self._logger.warning("Trial failed with error {}".format(e))
-
-            return {'loss': np.nan,
-                    'status': STATUS_FAIL,
-                    'score': np.nan,
-                    'score_variance': np.nan,
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': np.nan}
-
-    def search_for_best_pipeline(self,
-                                 niter: int,
-                                 algo: callable = tpe.suggest):
-        '''
-        Method performing the search of the best pipeline in the given space.
-        Calls fmin function from the hyperopt library to minimize the output of
-        _objective.
-
-        :param int niter: number of search iterations
-        :param callable algo: currently can only take the values tpe.suggest
-            for a tree-based search or rand.suggest for random search
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        assert(isinstance(niter, int)),\
-            "Parameter 'niter' must be of int type"
-
-        # right now only two algorithms are provided by hyperopt
-        assert(algo in [tpe.suggest, rand.suggest]),\
-            ("Parameter 'algo' can currently only be tpe or rand. "
-             "If other algorithms have been developed "
-             "by hyperopt, please add them to the list.")
-
-        try:
-            self._logger.info(("Starting {0} iterations of search "
-                               "additional to {1} previous"
-                               .format(niter, len(self._trials.trials))))
-
-            best = fmin(fn=self._objective,
-                        space=self._space,
-                        algo=algo,
-                        trials=self._trials,
-                        max_evals=len(self._trials.trials) + niter)
-
-            # print('AAAA', str(niter))
-
-            self._logger.info(
-                    "Best score is {0} with variance {1}"
-                    .format(
-                     self._trials.best_trial["result"]["score"],
-                     self._trials.best_trial["result"]["score_variance"]))
-
-            self._logger.info(("Finished {0} iterations of search.\n"
-                               "Best parameters are:\n {1} ")
-                              .format(niter,
-                                      space_eval(self._space, best)))
-
-            self._backup_trials()
-
-        except Exception as e:
-            raise ValueError(("Failed to select best "
-                             "pipeline! Exit with error: {}").format(e))
-
-    @property
-    def best_trial_score(self) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > 0:
-            return self._trials.best_trial["result"]["score"]
-        else:
-            return np.nan
-
-    @property
-    def best_trial_score_variance(self) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > 0:
-            return self._trials.best_trial["result"]["score_variance"]
-        else:
-            return np.nan
-
-    @property
-    def best_trial_pipeline(self) -> Pipeline:
-        '''
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) > 0:
-
-            return space_eval(
-                    self._space,
-                    {k: v[0] for k, v in
-                     self._trials.best_trial['misc']['vals'].items()
-                     if len(v) > 0})["pipeline"]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def _ith_trial_loss(self, i: int) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > i:
-            return self._trials.trials[i]['result']['loss']
-        else:
-            return np.nan
-
-    def _ith_trial_element(self, i: int, name: str) -> object:
-        '''
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) > i:
-            return space_eval(self._space,
-                              {k: v[0] for k, v in
-                               self._trials.trials[i]['misc']['vals']
-                               .items() if len(v) > 0})[name]
-
-    def _ith_trial_pipeline(self, i: int) -> Pipeline:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='pipeline')
-
-    def _ith_trial_name(self, i: int) -> str:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='name')
-
-    def _ith_trial_params(self, i: int) -> dict:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='params')
-
-    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
-        '''
-        '''
-        if len(self._trials.trials) > i:
-            return self._trials.trials[i]["result"]["timestamp"]
-
-    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
-        '''
-        Returns the list of n best pipelines
-        documented in trials
-        '''
-        if len(self._trials.trials) > 0:
-            if losses is None:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))]
-
-            best_n_indices = [losses.index(l)
-                              for l in sorted(list(set(losses)))[:n]]
-
-            return [self._ith_trial_pipeline(i) for i in best_n_indices]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
-        '''
-        Returns a dictionary where keys are pipeline names,
-        and values are lists of best pipelines with this name
-        '''
-        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
-
-        if len(self._trials.trials) > 0:
-
-            best_pipelines_per_type = {}
-            names = [self._ith_trial_name(i)
-                     for i in range(len(self._trials.trials))]
-
-            for nm in names:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))
-                          if self._ith_trial_name(i) == nm]
-
-                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
-                                                        n=n,
-                                                        losses=losses)
-
-            return best_pipelines_per_type
-
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def write_trials_documentation(self, path: str = None):
-        '''
-        Saves an excel file with pipeline names, scores,
-        parameters, and timestamps.
-        '''
-        if len(self._trials.trials) > 0:
-            path = path or "hyperopt_trials_documentation.xlsx"
-
-            assert(isinstance(path, str)),\
-                "Parameter 'path' must be of string type"
-
-            self._assert_valid_directory(path)
-
-            names = [self._ith_trial_name(i)
-                     for i in range(len(self._trials.trials))]
-            scores = [self._score_factor*self._ith_trial_loss(i)
-                      for i in range(len(self._trials.trials))]
-            params = [self._ith_trial_params(i)
-                      for i in range(len(self._trials.trials))]
-            timestamps = [self._ith_trial_timestamp(i)
-                          for i in range(len(self._trials.trials))]
-
-        else:
-            path = path or "hyperopt_trials_documentation.xlsx"
-            names = []
-            scores = []
-            params = []
-            timestamps = []
-
-        pd.DataFrame({"name": names,
-                      "score": scores,
-                      "params": params,
-                      "timestamp": timestamps})\
-          .to_excel(path)
-
-
-if __name__ == '__main__':
-
-    from sklearn.metrics import roc_auc_score, make_scorer
-    from xgboost import XGBClassifier
-    from sklearn.svm import SVC
-    from sklearn.feature_selection import SelectKBest
-    from sklearn.decomposition import PCA
-    from sklearn.datasets import load_iris
-    from pprint import pprint
-
-    data = load_iris()
-    X = pd.DataFrame(data.data)
-    y = pd.Series(data.target)
-    # produce a binary variable
-    y = (y == 2).astype(int)
-    del data
-    gc.collect()
-
-    # SPACE DEFINITION ########################################
-    # (can be moved to a separate python script)
-
-    """
-    A search space must be a list of dictionaries.
-    Each dictionary must have keys:
-        name (pipeline name or type),
-        pipeline (instance of sklearn.pipeline.Pipeline),
-        params (dictionary of distributions for the parameters of
-                the pipeline that we want to tune)
-
-    Here we have a space that consists of two dictionaries:
-    KBEST_XGBOOST and PCA_SVC
-    """
-    space = []
-
-    pipeline_dist_1 = {}
-    pipeline_dist_1["name"] = "KBEST_XGBOOST"
-
-    """
-    A pipeline consists of steps (tuples).
-    Each step has a name and an algorithm.
-    This pipeline, as a first step performs
-    feature selection with SelectKBest and
-    as a second step evaluates a machine learning algo (xgboost).
-
-    Like all sklearn algorithms, a Pipeline has methods
-    fit, predict, set_params, get_params
-    """
-    pipeline_dist_1["pipeline"] = Pipeline([
-                                     ('kbest', SelectKBest()),
-                                     ('xgb', XGBClassifier())
-                                     ])
-    """
-    Pipeline parameter dictionaries must be of the form:
-    {'kbest__k': 3, xgb__n_estimators: 20},
-    each parameter name consists of the step name, __, and parameter name.
-
-    Here, instead of values, the parameter names are followed
-    by hyperopt distributions.
-    Each hyperopt distribution also must have a name,
-    due to hyperopt functionality.
-
-    Here, we set the hyperopt distribution name to the step name,
-    but it does not have to be so. Hyperopt distribution names
-    must be different for different elements of the space.
-    """
-
-    pipeline_dist_1["params"] = {
-            'kbest__k': hp.choice('kbest__k', range(1, 5)),
-
-            'xgb__n_estimators':
-            50 + hp.randint('xgb__n_estimators', 50),
-
-            "xgb__learning_rate":
-            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
-            }
-
-    space.append(pipeline_dist_1)
-
-    pipeline_dist_2 = {}
-    pipeline_dist_2["name"] = "PCA_SVC"
-
-    pipeline_dist_2["pipeline"] = Pipeline([
-                                     ('pca', PCA()),
-                                     ('svc', SVC(gamma="scale"))
-                                     ])
-
-    pipeline_dist_2["params"] = {
-            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
-
-            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
-            }
-
-    space.append(pipeline_dist_2)
-
-    space = hp.choice('pipelines', space)
-
-    # TESTING ##########################################################
-
-    trials_path = 'TEST_hyperopt_trials.pkl'
-
-    doc_path = 'TEST_hyperopt_doc.xlsx'
-
-    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
-                                       greater_is_better=True,
-                                       trials_path=trials_path)
-
-    hp_obj.attach_data(X_train=X, y_train=y)
-
-    hp_obj.attach_space(space=space)
-
-    hp_obj.search_for_best_pipeline(niter=10)
-
-    print('\n', '='*20, 'TESTING', '='*20)
-
-    print('\n', 'Best score:', hp_obj.best_trial_score)
-
-    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
-
-    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
-
-    print('\n', 'Best 3 pipelines: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
-
-    print('\n', 'Best pipeline per type: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
-
-    hp_obj.write_trials_documentation(path=doc_path)
-
-    # os.remove(doc_path)
-    # os.remove(trials_path)
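
The _objective method documented above follows hyperopt's objective contract: return a dictionary with at least 'loss' and 'status', plus any extra keys to be stored in the trials object. A minimal sketch of that contract with a hypothetical one-parameter search space, independent of the pipeline machinery removed here:

# Minimal sketch of the objective contract consumed by fmin (hypothetical objective).
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

def objective(x: float) -> dict:
    # fmin minimizes 'loss'; any extra keys end up in the trials object
    loss = (x - 2.0) ** 2
    return {"loss": loss, "status": STATUS_OK, "score": -loss}

trials = Trials()
best = fmin(fn=objective,
            space=hp.uniform("x", -5, 5),
            algo=tpe.suggest,
            max_evals=30,
            trials=trials)

print(best)                                   # e.g. {'x': 2.01}
print(trials.best_trial["result"]["score"])   # extra keys retrieved later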

+ 0 - 130
import_process_instances/CleanProcessTable.py

@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Sep 30 08:55:56 2019
-
-@author: tanya
-"""
-
-import pandas as pd
-import numpy as np
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from libraries.db_migration.MigrationCleaning import MigrationCleaning
-
-
-class CleanTable(MigrationCleaning):
-    '''
-    '''
-
-    def __init__(self, mapping_path: str,
-                 inconsist_report_table: str,
-                 filter_index_columns: (str, list),
-                 sort_columns: list = None,
-                 index_columns: list = None,
-                 log_name: str = "CleanProcessTable"):
-        '''
-        '''
-        super().__init__(
-                mapping_path=mapping_path,
-                schema_paths=[os.path.join(".", "mongo_schema",
-                                           "schema_process_instances.json"),
-                              os.path.join(".", "mongo_schema",
-                                           "schema_wheelsets.json"),
-                              os.path.join(".", "mongo_schema",
-                                           "schema_components.json")],
-                inconsist_report_table=inconsist_report_table,
-                filter_index_columns=filter_index_columns,
-                log_name=log_name)
-
-        self._tablename = os.path.basename(self._mapping_path)\
-                            .split("_mapping")[0]
-
-        self._sort_columns = sort_columns
-        self._index_columns = index_columns
-
-        from libraries.db_handlers.SQLHandler import SQLHandler
-
-        self._sql_db = SQLHandler()
-
-    def read_data(self, wheelsets):
-        '''
-        '''
-        if len(wheelsets) > 1:
-            query = "SELECT * FROM {0} WHERE radsatznummer in {1}"\
-                    .format(self._tablename, tuple(wheelsets))
-        else:
-            query = "SELECT * FROM {0} WHERE radsatznummer = '{1}'"\
-                    .format(self._tablename, wheelsets[0])
-
-        return self._sql_db.read_sql_to_dataframe(query)
-
-    def drop_duplicated_entries(self, data: pd.DataFrame,
-                                columns_to_ignore: list = None
-                                ) -> pd.DataFrame:
-        '''
-        '''
-        if columns_to_ignore is None:
-            columns_to_ignore = ["ende_der_bearbeitung"]
-
-        self.error_column_abscence(columns=columns_to_ignore, data=data)
-
-        defining_columns = [c for c in data.columns
-                            if c not in columns_to_ignore]
-
-        return data.drop_duplicates(subset=defining_columns)\
-                   .reset_index(drop=True)
-
-    @property
-    def field_mapping(self):
-        '''
-        '''
-        return self._mapping_parser.get_field_mapping()
-
-
-class CleanProcessTable(CleanTable):
-    '''
-    '''
-    def __init__(self, mapping_path: str,
-                 inconsist_report_table: str = None,
-                 filter_index_columns=["radsatznummer"],
-                 sort_columns: list = None,
-                 index_columns: list = None,
-                 log_name: str = "CleanProcessTable"):
-        '''
-        '''
-        super().__init__(
-                mapping_path=mapping_path,
-                sort_columns=sort_columns,
-                index_columns=index_columns,
-                inconsist_report_table=inconsist_report_table,
-                filter_index_columns=filter_index_columns,
-                log_name=log_name)
-
-    def _get_next_station_start_time(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        self.error_column_abscence(columns=["radsatznummer", "positionsnummer",
-                                            "begin_der_bearbeitung"],
-                                   data=data)
-
-        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung"],
-                         inplace=True)
-
-        start_time_next_station =\
-            data.groupby("radsatznummer")["begin_der_bearbeitung"].shift(-1)\
-                .fillna("temp")
-
-        station_change = (data.groupby("radsatznummer")["positionsnummer"]
-                              .shift(-1) != data["positionsnummer"])
-
-        start_time_next_station.loc[~station_change] = np.nan
-
-        start_time_next_station.fillna(method="bfill", inplace=True)
-
-        start_time_next_station.loc[start_time_next_station == "temp"] = np.nan
-
-        return pd.to_datetime(start_time_next_station)
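
The _get_next_station_start_time helper removed above fills each row with the begin_der_bearbeitung of the next station via a shift/bfill trick. A toy illustration with made-up data, repeating the same steps:

# Toy illustration of the shift/bfill steps above (made-up data).
import numpy as np
import pandas as pd

data = pd.DataFrame({
    "radsatznummer": ["A", "A", "A", "B"],
    "positionsnummer": [10, 10, 20, 10],
    "begin_der_bearbeitung": ["2019-01-01 08:00", "2019-01-01 09:00",
                              "2019-01-01 10:00", "2019-01-02 08:00"],
}).sort_values(["radsatznummer", "begin_der_bearbeitung"])

# start time of the next row within the same wheel-set; last row gets a sentinel
nxt = data.groupby("radsatznummer")["begin_der_bearbeitung"].shift(-1).fillna("temp")

# True where the next row belongs to a different station (or wheel-set)
station_change = (data.groupby("radsatznummer")["positionsnummer"].shift(-1)
                  != data["positionsnummer"])

nxt.loc[~station_change] = np.nan   # ignore rows whose next row is the same station
nxt = nxt.bfill()                   # pull the next station's start time backwards
nxt.loc[nxt == "temp"] = np.nan     # last station per wheel-set has no successor

print(pd.to_datetime(nxt))
# both rows of station 10 (wheel-set A) get 2019-01-01 10:00, the rest are NaT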

+ 0 - 87
import_process_instances/CleanRs0.py

@@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Sep 30 10:14:46 2019
-
-@author: tanya
-"""
-
-import pandas as pd
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from libraries.import_process_instances.CleanProcessTable import CleanTable
-
-
-class CleanRs0(CleanTable):
-    '''
-    '''
-    def __init__(self):
-        '''
-        '''
-        super().__init__(
-                mapping_path=os.path.join(".", "migration_mappings",
-                                          "rs0_mapping.json"),
-                inconsist_report_table="inconsist_rs0",
-                filter_index_columns=["radsatznummer"],
-                sort_columns=["radsatznummer", "eingabe_datum"],
-                index_columns=["radsatznummer", "eingabe_datum"],
-                log_name="CleanRs0:")
-
-    def restrict_to_process_data(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        process_columns = ["radsatznummer", "aufarbeitungstyp", "ihs",
-                           "befundung_code_1", "befundung_code_2",
-                           "befundung_code_3"]
-
-        self.error_column_abscence(columns=process_columns,
-                                   data=data)
-
-        return data[process_columns]
-
-    def add_ist_schrott(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        mongo_name = "final_state.ist_schrott"
-
-        self.error_column_abscence(columns=["aufarbeitungstyp"],
-                                   data=data)
-
-        data[mongo_name] = (data["aufarbeitungstyp"] == 2)
-
-        return data
-
-    def restrict_to_meta_data(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        '''
-        meta_columns = [c for c in data.columns if c not in
-                        ["aufarbeitungstyp", "ihs",
-                         "befundung_code_1", "befundung_code_2",
-                         "befundung_code_3"]]
-
-        self.error_column_abscence(columns=meta_columns,
-                                   data=data)
-
-        return data[meta_columns]
-
-    def filter_invalid_metacolumns(self, data: pd.DataFrame,
-                                   metacolumns: list = None) -> pd.DataFrame:
-        '''
-        '''
-        if metacolumns is None:
-            metacolumns = ["wellentype", "Lagerbauart", "tauschgruppe"]
-
-        for column in metacolumns:
-
-            invalid_mask = data[column].isnull()
-
-            reason = "Missing {}".format(column)
-
-            data = self._filter_invalid_data(invalid_mask=invalid_mask,
-                                             reason=reason,
-                                             data=data)
-
-        return data

+ 0 - 170
import_process_instances/CleanRs1.py

@@ -1,170 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Sep 30 09:59:54 2019
-
-@author: tanya
-"""
-
-import gc
-import pandas as pd
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
-
-
-class CleanRs1(CleanProcessTable):
-    '''
-    '''
-    def __init__(self):
-        '''
-        '''
-        super().__init__(
-                mapping_path=os.path.join(".", "migration_mappings",
-                                          "rs1_mapping.json"),
-                inconsist_report_table="inconsist_rs1",
-                sort_columns=["radsatznummer", "begin_der_bearbeitung"],
-                index_columns=["radsatznummer"],
-                log_name="CleanRs1")
-
-    def clean_ende_der_bearbeitung(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        We filter out all the data that has a missing begin_der_bearbeitung
-         (these cases should be very rare).
-         If ende_der_bearbeitung is missing, we fill it with the
-         begin_der_bearbeitung of the next station.
-        '''
-        self.error_column_abscence(columns=["radsatznummer",
-                                            "ende_der_bearbeitung",
-                                            "begin_der_bearbeitung",
-                                            "status"],
-                                   data=data)
-
-        for time_column in ["ende_der_bearbeitung", "begin_der_bearbeitung"]:
-            data[time_column] = pd.to_datetime(data[time_column])
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        start_time_next_station = self._get_next_station_start_time(data=data)
-
-        data["ende_der_bearbeitung"].fillna(start_time_next_station,
-                                            inplace=True)
-
-        del start_time_next_station
-        gc.collect()
-
-        return data
-
-    def filter_invalid_ende_der_bearbeitung(self, data: pd.DataFrame
-                                            ) -> pd.DataFrame:
-        '''
-        '''
-        is_invalid = (
-                (data["ende_der_bearbeitung"].isnull() &
-                 (data["status"] != "Aktiv")) |
-                (data["begin_der_bearbeitung"].isnull()) |
-                (data["ende_der_bearbeitung"] < data["begin_der_bearbeitung"]))
-
-        data = self._filter_invalid_data(
-                    data=data,
-                    invalid_mask=is_invalid,
-                    reason="invalid ende der bearbeitung")
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        return data
-
-    def filter_invalid_status(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        We filter out the cases when work at a station was finished
-         with the status "Aktiv" or "Abbruch". An exception is the very last
-         station per wheel-set because it can a non-finished process.
-        '''
-        self.error_column_abscence(columns=["radsatznummer",
-                                            "positionsnummer",
-                                            "status"],
-                                   data=data)
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        is_station_change = (data["positionsnummer"] !=
-                             data["positionsnummer"].shift(-1))
-
-        is_last_station = (data["radsatznummer"] !=
-                           data["radsatznummer"].shift(-1))
-
-        has_invalid_status = (
-                is_station_change &
-                (~is_last_station) &
-                (data["status"].isin(["Aktiv", "Abbruch"])))
-
-        data = self._filter_invalid_data(
-                    data=data,
-                    invalid_mask=has_invalid_status,
-                    reason="invalid status")
-
-        return data
-
-    def add_finished(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        We add a variable indicating if the process is finished or not
-        '''
-        mongo_name = "final_state.finished"
-
-        self.error_column_abscence(columns=["radsatznummer", "status"],
-                                   data=data)
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        not_finished = ["Aktiv", "Abbruch"]
-
-        last_status_map = data.groupby("radsatznummer")["status"].last()
-
-        data[mongo_name] = ~data["radsatznummer"].map(last_status_map)\
-                                                 .isin(not_finished)
-
-        return data
-
-    def add_stage(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        In the configuration we store the definition of the process stages
-         in the form of a graph.
-        '''
-        from libraries.configuration import default as cfg
-
-        mongo_name = "process.stage"
-
-        self.error_column_abscence(columns=["radsatznummer", "positionsname"],
-                                   data=data)
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        def cumsum_str(x):
-            return x.cumsum()
-
-        def break_cum_string_to_list(x):
-            return [int(st) for st in x.split("|")[:-1]]
-
-        previous_stations = data\
-            .assign(positionsnummer=data.positionsnummer.astype(str).add("|"))\
-            .groupby("radsatznummer")["positionsnummer"]\
-            .apply(cumsum_str)\
-            .apply(break_cum_string_to_list)
-
-        for stage in cfg.process_stages.nodes():
-            this_stage_stations = cfg.process_stages.nodes()[stage]["stations"]
-            next_stage_stations = [item for next_stage
-                                   in cfg.process_stages.successors(stage)
-                                   for item in cfg.process_stages.nodes()
-                                   [next_stage]["stations"]]
-
-            def check_stage(x):
-                return (len(set(this_stage_stations) & set(x)) != 0) and \
-                       (len(set(next_stage_stations) & set(x)) == 0)
-
-            data.loc[previous_stations.apply(check_stage), mongo_name] = stage
-
-        return data
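
The add_stage method above reads the stage definition from a configuration graph that is not part of this diff. The sketch below builds a hypothetical two-stage networkx graph and applies the same test (stations of this stage visited, stations of the successor stage not yet visited) to a list of previously visited stations:

# Hypothetical two-stage graph mirroring the cfg.process_stages usage above.
import networkx as nx

process_stages = nx.DiGraph()
process_stages.add_node("stage_1", stations=[10, 11])
process_stages.add_node("stage_2", stations=[20, 21])
process_stages.add_edge("stage_1", "stage_2")

def current_stage(previous_stations: list) -> str:
    """Return the stage whose stations were visited while its successors' were not."""
    result = None
    for stage in process_stages.nodes():
        this_stage = set(process_stages.nodes()[stage]["stations"])
        next_stages = set(
            station
            for nxt in process_stages.successors(stage)
            for station in process_stages.nodes()[nxt]["stations"])
        visited = set(previous_stations)
        if (this_stage & visited) and not (next_stages & visited):
            result = stage
    return result

print(current_stage([10]))          # 'stage_1'
print(current_stage([10, 11, 20]))  # 'stage_2'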

+ 0 - 82
import_process_instances/CleanRs2.py

@@ -1,82 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Sep 30 10:06:48 2019
-
-@author: tanya
-"""
-
-import pandas as pd
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
-
-
-class CleanRs2(CleanProcessTable):
-    '''
-    '''
-    def __init__(self):
-        '''
-        '''
-        super().__init__(
-                mapping_path=os.path.join(".", "migration_mappings",
-                                          "rs2_mapping.json"),
-                inconsist_report_table="inconsist_rs2",
-                sort_columns=["radsatznummer", "ende_der_bearbeitung"],
-                index_columns=["radsatznummer", "positionsnummer"],
-                log_name="CleanRs2")
-
-    def filter_invalid_ende_der_bearbeitung(self, data: pd.DataFrame
-                                            ) -> pd.DataFrame:
-        '''
-        We filter out all the rows that have a missing ende_der_bearbeitung;
-         it means that the activities were planned, but not executed.
-        '''
-        self.error_column_abscence(columns=["radsatznummer",
-                                            "ende_der_bearbeitung"],
-                                   data=data)
-
-        is_invalid = (data["ende_der_bearbeitung"].isnull())
-
-        data = self._filter_invalid_data(
-                    data=data,
-                    invalid_mask=is_invalid,
-                    reason="invalid ende der bearbeitung")
-
-        data["ende_der_bearbeitung"] =\
-            pd.to_datetime(data["ende_der_bearbeitung"])
-
-        return data
-
-    def filter_invalid_taetigkeitsname(self, data: pd.DataFrame
-                                       ) -> pd.DataFrame:
-        '''
-        In the configuration we store a list of activities whose execution
-         means that the wheel-set is scrap. After one of these activities
-         has been executed, the process history should end.
-        '''
-        from libraries.configuration import default as cfg
-
-        self.error_column_abscence(columns=["radsatznummer",
-                                            "taetigkeitsname"],
-                                   data=data)
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        is_last_station = (
-            data["radsatznummer"] !=
-            data["radsatznummer"].shift(-1))
-
-        is_invalid = (
-                ~is_last_station &
-                (data["taetigkeitsname"].isin(cfg.schrott_taetigkeiten)))
-
-        data = self._filter_invalid_data(
-                    data=data,
-                    invalid_mask=is_invalid,
-                    reason="invalid taetigkeit")
-
-        return data
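The rule is easiest to see on a toy frame: a scrap activity is only valid if it is the wheel-set's last recorded row. A minimal sketch with invented values, where ["verschrotten"] stands in for cfg.schrott_taetigkeiten:

# toy frame with invented values; only the column names mirror the real table
import pandas as pd

df = pd.DataFrame({
    "radsatznummer": ["A", "A", "B"],
    "ende_der_bearbeitung": pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-01"]),
    "taetigkeitsname": ["verschrotten", "pruefen", "verschrotten"],
})
schrott_taetigkeiten = ["verschrotten"]  # stand-in for cfg.schrott_taetigkeiten

df = df.sort_values(by=["radsatznummer", "ende_der_bearbeitung"])
is_last_station = df["radsatznummer"] != df["radsatznummer"].shift(-1)
is_invalid = ~is_last_station & df["taetigkeitsname"].isin(schrott_taetigkeiten)
print(df.loc[is_invalid, "radsatznummer"].tolist())  # ['A']: scrap activity followed by more history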

+ 0 - 58
import_process_instances/CleanRs70.py

@@ -1,58 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Sep 30 10:11:55 2019
-
-@author: tanya
-"""
-import pandas as pd
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from libraries.import_process_instances.CleanProcessTable import CleanProcessTable
-
-
-class CleanRs70(CleanProcessTable):
-    '''
-    Cleans the rs70 table (schadcode records) before merging and import.
-    '''
-    def __init__(self):
-        '''
-        '''
-        super().__init__(
-                mapping_path=os.path.join(".", "migration_mappings",
-                                          "rs70_mapping.json"),
-                inconsist_report_table="inconsist_rs70",
-                sort_columns=["radsatznummer", "eingabe_datum"],
-                index_columns=["radsatznummer", "eingabe_datum"],
-                log_name="CleanRs70")
-
-    def filter_invalid_schadcode(self, data: pd.DataFrame) -> pd.DataFrame:
-        '''
-        In the configuration we store a list of schadcodes whose assignment
-         means that the product is scrap. No further schadcodes should be
-         assigned after such a schadcode.
-        '''
-        from libraries.configuration import default as cfg
-
-        self.error_column_abscence(columns=["radsatznummer", "schadcode"],
-                                   data=data)
-
-        data.sort_values(by=self._sort_columns, inplace=True)
-
-        is_last_schadcode = (data["radsatznummer"] !=
-                             data["radsatznummer"].shift(-1))
-
-        is_invalid = (~is_last_schadcode &
-                      data["schadcode"].isin(cfg.schrott_schadcodes))
-
-        data = self._filter_invalid_data(
-                    data=data,
-                    invalid_mask=is_invalid,
-                    reason="invalid schadcode")
-
-        # XXX temporary here
-        # data["eingabe_datum"] = pd.to_datetime(data["eingabe_datum"])
-
-        return data

+ 0 - 149
import_process_instances/MergeProcessTables.py

@@ -1,149 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Mon Sep 30 10:16:23 2019
-
-@author: tanya
-"""
-
-import pandas as pd
-import numpy as np
-import os
-import sys
-
-sys.path.append(os.getcwd())
-
-from libraries.import_process_instances.CleanRs1 import CleanRs1
-
-
-class MergeProcessTables:
-    '''
-    Merges the cleaned rs2, rs70 and rs0 tables into the rs1 process table.
-    '''
-    def merge_rs2(self, data: pd.DataFrame, rs2: pd.DataFrame) -> pd.DataFrame:
-        '''
-        Difficulty: rows that correspond to one radsatznummer and one station
-         are in a many-to-many relation between rs1 and rs2, and the
-         ende_der_bearbeitung of these rows often does not match between
-         the two tables.
-
-        Rules:
-            A) We check whether the ende_der_bearbeitung of an activity from
-            rs2 is >= the begin_der_bearbeitung and <= the
-            ende_der_bearbeitung of an entry in rs1.
-
-            B) If an activity (row in rs2) has an ende_der_bearbeitung later
-            than the ende_der_bearbeitung of all entries in rs1, we check
-            whether it ended before the begin_der_bearbeitung on the next
-            station. If so, we assign the activity to the latest rs1 entry
-            for this station.
-
-        The same logic applies when merging the table rs70.
-        '''
-        data = data.copy(deep=True)
-        rs2 = rs2.copy(deep=True)
-
-        station_change = (data["positionsnummer"] !=
-                          data["positionsnummer"].shift(-1))
-
-        data["order"] = data.index
-
-        common_columns = ["radsatznummer", "positionsnummer",
-                          "positionsname"]
-
-        data = pd.merge(data, rs2, how="left", on=common_columns)
-
-        start_time_next_station =\
-            CleanRs1()._get_next_station_start_time(data)\
-                      .fillna(data["ende_der_bearbeitung_x"])
-
-        start_matches = (data["ende_der_bearbeitung_y"] >=
-                         data["begin_der_bearbeitung"])
-
-        end_matches = ((data["ende_der_bearbeitung_y"] <=
-                       data["ende_der_bearbeitung_x"]) |
-                       data["ende_der_bearbeitung_y"].isnull())
-
-        end_almost_matches = ((data["ende_der_bearbeitung_y"] <=
-                               start_time_next_station) &
-                              station_change
-                              )
-
-        time_matches = (start_matches & end_matches) |\
-                       (start_matches & (~end_matches) & end_almost_matches)
-
-        rs2_columns = [c for c in rs2.columns
-                       if (c not in common_columns) and (c in data.columns)] +\
-                      [c + "_y" for c in rs2.columns
-                       if c + "_y" in data.columns]
-
-        for c in rs2_columns:
-            data.loc[~time_matches, c] = np.nan
-
-        data.sort_values(by=["radsatznummer",
-                             "begin_der_bearbeitung",
-                             "ende_der_bearbeitung_y"],
-                         inplace=True)
-
-        # we keep all the rows that were in rs1 even if there are no
-        # corresponding activities from rs2
-        keep_row = time_matches | (~data["order"].duplicated(keep="first"))
-
-        data = data.loc[keep_row].copy(deep=True).reset_index(drop=True)
-
-        data["ende_der_bearbeitung"] = data[["ende_der_bearbeitung_x",
-                                             "ende_der_bearbeitung_y"]]\
-            .max(axis=1)
-
-        data.drop(["ende_der_bearbeitung_x", "ende_der_bearbeitung_y",
-                   "order"], axis=1, inplace=True)
-
-        return data
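Rules A and B can be illustrated on hand-made timestamps. In this sketch all values are invented, and station_change and the next-station start time are supplied directly instead of being derived via CleanRs1()._get_next_station_start_time:

# minimal sketch of the time_matches condition with invented timestamps
import pandas as pd

df = pd.DataFrame({
    "begin_der_bearbeitung": pd.to_datetime(["2019-01-01 08:00", "2019-01-01 08:00"]),
    "ende_der_bearbeitung_x": pd.to_datetime(["2019-01-01 10:00", "2019-01-01 10:00"]),  # rs1 end
    "ende_der_bearbeitung_y": pd.to_datetime(["2019-01-01 09:30", "2019-01-01 11:00"]),  # rs2 end
    "start_next_station": pd.to_datetime(["2019-01-01 12:00", "2019-01-01 12:00"]),
    "station_change": [True, True],
})

start_matches = df["ende_der_bearbeitung_y"] >= df["begin_der_bearbeitung"]
end_matches = (df["ende_der_bearbeitung_y"] <= df["ende_der_bearbeitung_x"]) | \
    df["ende_der_bearbeitung_y"].isnull()
end_almost_matches = (df["ende_der_bearbeitung_y"] <= df["start_next_station"]) & \
    df["station_change"]

time_matches = (start_matches & end_matches) | \
               (start_matches & ~end_matches & end_almost_matches)
print(time_matches.tolist())  # [True, True]: rule A matches row 0, rule B rescues row 1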
-
-    def merge_rs70(self, data: pd.DataFrame, rs70: pd.DataFrame
-                   ) -> pd.DataFrame:
-        '''
-        Attaches rs70 entries to the rs1 rows whose processing interval
-        contains their eingabe_datum; rs1 rows without a match are kept.
-        '''
-        data["order"] = data.index
-
-        data = pd.merge(data, rs70, how="left", on="radsatznummer")
-
-        time_matches = (
-                (data["eingabe_datum"] >= data["begin_der_bearbeitung"]) &
-                (data["eingabe_datum"] <= data["ende_der_bearbeitung"]))
-
-        rs70_columns = [c for c in rs70.columns
-                        if (c != "radsatznummer") and (c in data.columns)] +\
-                       [c + "_y" for c in rs70.columns
-                        if c + "_y" in data.columns]
-
-        for c in rs70_columns:
-            data.loc[~time_matches, c] = np.nan
-
-        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung",
-                             "eingabe_datum"], inplace=True)
-
-        keep_row = time_matches | (~data["order"].duplicated(keep="first"))
-
-        data = data.loc[keep_row]\
-                   .drop("order", axis=1)\
-                   .reset_index(drop=True)
-
-        return data
-
-    def merge_rs0(self, data: pd.DataFrame, rs0: pd.DataFrame) -> pd.DataFrame:
-        '''
-        Adds the rs0 columns per radsatznummer and blanks the befundung
-        codes on all rows except the befundung position (positionsnummer 110).
-        '''
-        data = pd.merge(data, rs0, how="left", on="radsatznummer")
-
-        no_befundung_mask = (data["positionsnummer"] != 110)
-
-        for column in ["befundung_code_1",
-                       "befundung_code_2",
-                       "befundung_code_3"]:
-
-            data.loc[no_befundung_mask, column] = np.nan
-
-        return data

BIN
import_process_instances/__pycache__/CleanProcessTable.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs0.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs1.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs2.cpython-37.pyc


BIN
import_process_instances/__pycache__/CleanRs70.cpython-37.pyc


BIN
import_process_instances/__pycache__/MergeProcessTables.cpython-37.pyc


BIN
import_process_instances/__pycache__/parallelized_import.cpython-37.pyc


+ 0 - 74
import_process_instances/parallelized_import.py

@@ -1,74 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Tue Oct  1 11:15:03 2019
-
-@author: tanya
-"""
-
-import os
-import sys
-from typing import Callable
-sys.path.append(os.getcwd())
-
-
-def get_all_wheelsets():
-    '''
-    :return: list of distinct wheelset numbers in the process
-    '''
-    from libraries.db_handlers.SQLHandler import SQLHandler
-
-    sql_db = SQLHandler()
-
-    query = "SELECT DISTINCT radsatznummer FROM rs1"
-
-    return sql_db.read_sql_to_dataframe(query)["radsatznummer"].tolist()
-
-
-def parallelized_import(all_instances: list,
-                        mongo_schema_path: str,
-                        import_chunk: Callable,
-                        log_name: str = None):
-
-    from concurrent.futures import ThreadPoolExecutor
-
-    from libraries.db_handlers.MongodbHandler import MongodbHandler
-
-    from libraries.log import Log
-
-    import argparse
-
-    argparser = argparse.ArgumentParser(description='Import process instances collection')
-    argparser.add_argument('--chunksize', type=int, default=100, help="Number of wheelsets processed at a time")
-    argparser.add_argument('--max_workers', type=int, default=10, help="Number of workers in ThreadPoolExecutor")
-    args = argparser.parse_args()
-
-    log = Log(log_name)
-
-    log.info("Start application")
-    log.info("Processing {0} wheelsets at a time parallelized with {1} workers"
-             .format(args.chunksize, args.max_workers))
-
-    # str.strip would drop characters, not the prefix, so remove "schema_" explicitly
-    collection_name = os.path.basename(mongo_schema_path).replace("schema_", "", 1).split(".")[0]
-
-    mongodb = MongodbHandler()
-
-    mongodb.create_collection_and_set_schema(
-            collection_name=collection_name,
-            schema_path=mongo_schema_path)
-
-    try:
-        n_chunks = len(all_instances)//args.chunksize + 1
-
-        with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
-            for i in range(n_chunks):
-                executor.submit(import_chunk,
-                                all_instances[i*args.chunksize:(i+1)*args.chunksize], i)
-
-    except Exception as e:
-        err = ("Failed to import {0} in mongodb. "
-               "Exit with error: {1}".format(collection_name, e))
-        log.error(err)
-        raise Exception(e)
-
-    log.info("Finished application")

+ 0 - 58
log.py

@@ -1,58 +0,0 @@
-# -*- coding: utf-8 -*-
-"""
-@author: jürgen.pannosch, tanja.zolotareva
-"""
-
-import sys
-import os
-import logging
-
-
-class Log:
-    def __init__(self, name: str = None,
-                 log_file: str = None,
-                 log_level: str = "INFO",
-                 print_to_stdout: bool = True):
-        """Sets the log level and the path where the log file is stored
-
-        :param log_file: Path to the log file.
-        :param log_level: Log level."""
-
-        if name is None:
-            name = ''
-
-        self._logger = logging.getLogger(name)
-
-        if (self._logger.hasHandlers()):
-            self._logger.handlers.clear()
-
-        if log_file is None:
-            log_file = os.path.join(".", "all.log")
-
-        assert(isinstance(log_file, str)),\
-            "Parameter 'log_path' must be of string type"
-
-        formatter = logging.Formatter(
-                '\n %(name)s %(asctime)s %(levelname)s %(message)s')
-
-        os.makedirs(os.path.dirname(log_file), exist_ok=True)
-
-        file_handler = logging.FileHandler(log_file)
-        file_handler.setFormatter(formatter)
-        self._logger.addHandler(file_handler)
-
-        if print_to_stdout:
-            stream_handler = logging.StreamHandler(sys.stdout)
-            stream_handler.setFormatter(formatter)
-            self._logger.addHandler(stream_handler)
-
-        self._logger.setLevel(log_level)
-
-    def info(self, message: str):
-        self._logger.info(message)
-
-    def warning(self, message: str):
-        self._logger.warning(message)
-
-    def error(self, message: str):
-        self._logger.error(message)
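A brief usage sketch; the log name, file path and messages are arbitrary:

from libraries.log import Log

log = Log(name="CleanRs2", log_file="./logs/clean_rs2.log")
log.info("Start cleaning rs2")
log.warning("3 rows have an empty positionsname")
log.error("Could not write the inconsistency report")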

+ 0 - 73
utils/ClassLogging.py

@@ -1,73 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Sep 27 14:20:58 2019
-
-@author: tanya
-"""
-
-import os
-import sys
-import pandas as pd
-sys.path.append(os.getcwd())
-
-
-class ClassLogging:
-    '''
-    Base class that provides logging helpers: log-and-raise, warnings and
-    checks for missing files and columns.
-    '''
-    def __init__(self, log_name: str = None):
-        '''
-        '''
-        from libraries.log import Log
-
-        self._log = Log(log_name)
-
-    def log_and_raise(self, message):
-        '''
-        '''
-        self._log.error(message)
-
-        raise Exception(message)
-
-    def log_and_warn(self, message):
-        '''
-        '''
-        self._log.warning(message)
-
-    def check_is_file(self, path):
-        '''
-        '''
-        if not os.path.isfile(path):
-            err = "File {} not found".format(path)
-            self._log.error(err)
-            raise FileNotFoundError(err)
-
-    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
-                               error_or_warning: str):
-        '''
-        '''
-        if isinstance(columns, str):
-            columns = [columns]
-
-        for column in columns:
-
-            if column not in data.columns:
-                err = ("{} is not an internal column name".format(column))
-                getattr(self._log, error_or_warning)(err)
-
-                if error_or_warning == "error":
-                    raise Exception(err)
-
-    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame):
-        '''
-        '''
-        return self._check_column_abscence(columns=columns,
-                                           data=data,
-                                           error_or_warning="error")
-
-    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
-        '''
-        '''
-        return self._check_column_abscence(columns=columns,
-                                           data=data,
-                                           error_or_warning="warning")

+ 0 - 62
utils/CleaningUtils.py

@@ -1,62 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Sep 27 16:20:03 2019
-
-@author: tanya
-"""
-
-import pandas as pd
-import numpy as np
-
-
-class CleaningUtils:
-    '''
-    Helpers for common cleaning steps: date parsing and string
-    standardization.
-    '''
-    @staticmethod
-    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
-        '''
-        Parses a series of date strings, trying the given formats in order.
-        '''
-        # a bare string would be split into characters by list()
-        formats = [formats] if isinstance(formats, str) else list(formats)
-
-        converted = pd.Series(pd.NaT, index=series.index)
-
-        for formt in formats:
-            if formt == "%d%m%Y":
-                missing_leading_zero = (series.astype(str).str.len() == 7)
-
-                series = series.astype(str)
-
-                series.loc[missing_leading_zero] = "0" +\
-                    series.loc[missing_leading_zero]
-
-            converted_this_format = pd.to_datetime(series,
-                                                   format=formt,
-                                                   errors="coerce")
-
-            converted.fillna(converted_this_format, inplace=True)
-
-        return converted
-
-    def standarize_writing(self, s: str) -> str:
-        '''
-        Transliterates German umlauts, lowercases the string and replaces
-        non-alphanumeric characters with underscores.
-        '''
-        import re
-
-        german_character_mapping = {"ß": "ss",
-                                    "ü": "ue",
-                                    "Ü": "Ue",
-                                    "ä": "ae",
-                                    "Ä": "Ae",
-                                    "ö": "oe",
-                                    "Ö": "Oe"}
-
-        s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
-        for char, correct_char in german_character_mapping.items():
-            s = s.replace(char, correct_char)
-
-        s = s.lower()
-
-        s = re.sub('[^0-9a-zA-Z]+', '_', s)
-
-        return s
-
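A short usage sketch for the two helpers, assuming the module is importable as libraries.utils.CleaningUtils; the input values are invented for illustration:

import pandas as pd

from libraries.utils.CleaningUtils import CleaningUtils

dates = pd.Series(["1052019", "13.05.2019", None])  # first value: %d%m%Y missing its leading zero
parsed = CleaningUtils.convert_dates(dates, formats=["%d%m%Y", "%d.%m.%Y"])
print(parsed.tolist())  # -> 2019-05-01, 2019-05-13, NaT

print(CleaningUtils().standarize_writing("Radsatzwelle geprüft/Ölnebel"))
# -> radsatzwelle_geprueft_oelnebel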

BIN
utils/__pycache__/ClassLogging.cpython-37.pyc


BIN
utils/__pycache__/CleaningUtils.cpython-37.pyc