tanja 3 years ago
parent
commit
4109ebff20

+ 0 - 0
README.md


+ 0 - 0
cdplib/DataExplorer/DataExplorer.py


+ 0 - 0
cdplib/FlattenData.py


+ 0 - 0
cdplib/Singleton_Threadsafe.py


+ 0 - 0
cdplib/__init__.py


+ 117 - 0
cdplib/db_handlers/InfluxdbHandler.py

@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 23 19:44:22 2021
+
+@author: tanya
+"""
+
+from cdplib.log import Log
+
+import pandas as pd
+
+from influxdb import DataFrameClient
+
+
+class InfluxdbHandler:
+    """
+    """
+    def __init__(self, database_url: str = None):
+        """
+        :param database_url: DSN of the InfluxDB instance, for example
+         influxdb://user:password@host:port/database; if None, the url is
+         read from the [INFLUX] section of the configuration file
+        :type database_url: str
+        """
+        self._logger = Log("InfluxdbHandler:")
+
+        if database_url is None:
+            database_url = self._read_url_from_env()
+
+        self.client = DataFrameClient.from_dsn(database_url)
+
+    def _read_url_from_env(self) -> str:
+        """
+        :return: database url parsed from the configuration object.
+         The configuration object is created by the script
+         /libraries.configuration.py and uses a configuration file
+         (by default .env)
+        :rtype: str
+
+        """
+        try:
+            from libraries.configuration import default as cfg
+
+            assert(cfg["INFLUX"] is not None),\
+                "configuration file must contain [INFLUX]"
+
+            assert(set(["INFLUX_HOST", "INFLUX_PORT", "INFLUX_DATABASE_NAME"])
+                   <= set(cfg["INFLUX"])),\
+                ("configuration file must contain influx host, ",
+                 " port, and database name")
+
+            database_url = "influxdb://"
+
+            if "INFLUX_USER" in cfg["INFLUX"]:
+                database_url += cfg["INFLUX"]["INFLUX_USER"]
+
+            if "INFLUX_PASSWORD" in cfg["INFLUX"]:
+                database_url += ":" + cfg["INFLUX"]["INFLUX_PASSWORD"]
+
+            database_url += "@{0}:{1}/{2}".format(
+                cfg["INFLUX"]["INFLUX_HOST"],
+                cfg["INFLUX"]["INFLUX_PORT"],
+                cfg["INFLUX"]["INFLUX_DATABASE_NAME"])
+
+            return database_url
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                ("Could not parse url from configuration file. "
+                 "Exit with error {}".format(e)))
+
+    def query_to_dataframe(self, query: str) -> pd.DataFrame:
+        """
+        :param query: InfluxQL query to execute
+        :type query: str
+        :return: result of the query as a dataframe
+        :rtype: pd.DataFrame
+        """
+        try:
+            # result of the query is a defaultdict
+            result = self.client.query(query)
+
+            return list(result.values())[0]
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                ("Could not query to dataframe. "
+                 "Exit with error {}".format(e)))
+
+    def query_between_dates(self, columns: str,
+                            tables: str,
+                            start: str,
+                            stop: str) -> pd.DataFrame:
+        """
+        :param columns: comma-separated list of columns to select
+        :type columns: str
+        :param tables: measurement to select from
+        :type tables: str
+        :param start: lower bound of the time range (exclusive)
+        :type start: str
+        :param stop: upper bound of the time range (exclusive)
+        :type stop: str
+        :return: result of the query as a dataframe
+        :rtype: pd.DataFrame
+
+        """
+        query = 'SELECT ' +\
+                columns +\
+                ' FROM \"' +\
+                tables +\
+                '\" WHERE time > \'' +\
+                str(start) +\
+                '\' AND time  < \'' +\
+                str(stop) +\
+                '\' tz(\'Europe/Berlin\');'
+
+        return self.query_to_dataframe(query)

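A minimal usage sketch for the new handler (the DSN, measurement, and column below are hypothetical; query_between_dates assembles the InfluxQL statement shown above):

    from cdplib.db_handlers.InfluxdbHandler import InfluxdbHandler

    # explicit DSN; alternatively pass nothing and let the handler read the
    # [INFLUX] section of the configuration file
    handler = InfluxdbHandler("influxdb://user:password@localhost:8086/sensors")

    # SELECT temperature FROM "machine_1" WHERE time > '2021-02-01' AND time < '2021-02-23' tz('Europe/Berlin');
    df = handler.query_between_dates(columns="temperature",
                                     tables="machine_1",
                                     start="2021-02-01",
                                     stop="2021-02-23")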
+ 0 - 0
cdplib/db_handlers/MongodbHandler.py


+ 0 - 0
cdplib/db_handlers/__init__.py


+ 0 - 0
cdplib/db_migration/DataFrameToCollection.py


+ 27 - 25
cdplib/db_migration/MigrationCleaning.py

@@ -255,9 +255,11 @@ class MigrationCleaning:
             columns = db.get_column_names(tablename=self._inconsist_report_table)
 
             if len(columns) > 0:
-                columns_not_in_data = [column for column in columns if column not in data.columns]
-                for value in columns_not_in_data:
-                    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
+                # TODO Tanya: The commented lines caused the reason to be the same for all entries.
+
+                #columns_not_in_data = [column for column in columns if column not in data.columns]
+                #for value in columns_not_in_data:
+                #    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
                 data_inconsist = data_inconsist[columns]
 
         db.append_to_table(data=data_inconsist,
@@ -396,7 +398,7 @@ class MigrationCleaning:
                     data[column] = data[column].astype(python_type)
 
                 elif python_type == float:
-                    
+
                     data[column] = data[column].fillna(np.inf)
                     # Replaces empty fields when type is string
                     if data[column].dtypes == object:
@@ -564,15 +566,15 @@ class MigrationCleaning:
         return data
 
     def clean_json_from_None_object(self, data: pd.DataFrame, clean_bool: bool = True) -> pd.DataFrame():
-        
+
         data = data.to_json(date_format="iso")
         data = json.loads(data)
         new_data = remap(data, lambda p, k, v: v is not None)
         new_data = remap(new_data, lambda p, k, v: v != 'None')
         new_data = remap(new_data, lambda p, k, v: v != 'inf')
-        # cleans not only bool type also int which are 0 or 1 
+        # cleans not only bool type also int which are 0 or 1
         # only use if it is necessary have to be change that it only considers
-        # Ture and False for bools 
+        # True and False for bools
         if clean_bool:
             new_data = remap(new_data, lambda p, k, v: (isinstance(v,bool) or (not isinstance(v,bool) and bool(v))))
         return new_data
@@ -588,27 +590,27 @@ class MigrationCleaning:
 
 
     def map_toleranzen_values(self, data: pd.DataFrame, toleranzen: pd.DataFrame):
-        
+
         toleranzen.drop('nr', axis=1, inplace=True)
-        
+
         toleranzen.columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.geometrie.durchmesser.min', 'wellenschenkel.geometrie.durchmesser.max', 'innenring.geometrie.durchmesser.min',
                         'innenring.geometrie.durchmesser.max', 'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
 
-        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max', 
+        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max',
                                     'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
-        
+
         labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.geometrie.durchmesser.min', 'labyrinthring.geometrie.durchmesser.max']
-        
+
         reparatur_stufe_labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.reparatur_stufe.durchmesser.min', 'labyrinthring.reparatur_stufe.durchmesser.max']
 
-        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min', 
+        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel.reparatur_stufe.durchmesser.max', 'innenring.reparatur_stufe.durchmesser.min',
-                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min', 
+                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.max']
 
-        
+
         toleranzen_reference_columns = ['wellenschenkel_toleranz', 'labyrinthring_toleranz', 'wellen_reparatur_stufe_toleranz', 'labyrinthring_reparatur_stufe_toleranz']
-        
+
         available_columns = [column for column in data.columns if column in toleranzen_reference_columns]
         for column in available_columns:
             merge_map = [False] *len(data.index)
@@ -623,13 +625,13 @@ class MigrationCleaning:
 
                     else:
                         temp_toleranzen.columns = labyrinten_columns
-                
+
                 elif 'reparatur_stufe' in column:
                     temp_toleranzen.columns = reparatur_stufe_columns
                     merge_map = data['innenring_reparatur_stufe_zulaessig'] == 'Ja'
                 data_before = len(data.index)
                 data = data.merge(temp_toleranzen, how='left', left_on=column, right_on='toleranzbez_wellen_reference')
-                data.loc[merge_map, temp_toleranzen.columns] = np.nan 
+                data.loc[merge_map, temp_toleranzen.columns] = np.nan
                 if data_before != len(data.index):
                     print('WEVE LOST DATA!!')
                     print('before:', data_before, 'now:', len(data.index))
@@ -641,9 +643,9 @@ class MigrationCleaning:
 
     def label_is_level(
                     self,
-                    data: pd.DataFrame, 
-                    column: str = "is", 
-                    include_schrott: bool = False, 
+                    data: pd.DataFrame,
+                    column: str = "is",
+                    include_schrott: bool = False,
                     drop_rows_with_no_is: bool = False) -> pd.DataFrame:
         '''
         '''
@@ -659,16 +661,16 @@ class MigrationCleaning:
                 data.loc[data[column].isin(v), column] = k
             else:
                 data.loc[data[column].isnull(), column] = k
-        
+
         if include_schrott and ("operation_type_2" in data.columns):
             schrott_mask = (data["operation_type_2"] == 2)
             data.loc[schrott_mask, column] = 5
-        
+
         data.loc[~data[column].isin([0,1,2,3,4,5]), column] = 0
-                    
+
         if drop_rows_with_no_is:
             data = data.loc[data[column] != 0].copy(deep=True)
-            
+
         return data.reset_index(drop=True)
 
 

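The label_is_level hunk above is mostly whitespace, but the logic it touches is easier to read in isolation. A standalone sketch of the final labelling steps visible in the diff (sample data hypothetical):

    import pandas as pd

    data = pd.DataFrame({"is": [1, 3, 99, None], "operation_type_2": [2, 0, 0, 0]})
    column = "is"

    # include_schrott branch: rows flagged as scrap get level 5
    schrott_mask = (data["operation_type_2"] == 2)
    data.loc[schrott_mask, column] = 5

    # anything outside the known is-levels falls back to 0
    data.loc[~data[column].isin([0, 1, 2, 3, 4, 5]), column] = 0

    # drop_rows_with_no_is branch: optionally drop level-0 rows
    data = data.loc[data[column] != 0].copy(deep=True).reset_index(drop=True)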
+ 0 - 0
cdplib/db_migration/ParseDbSchema.py


+ 0 - 0
cdplib/db_migration/ParseJsonSchema.py


+ 0 - 0
cdplib/db_migration/ParseMapping.py


+ 270 - 0
cdplib/feature_engineering/StatisticalFeatures.py

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" 
+Created on Tue Oct 16 16:08:47 2018
+
+@author: tanya
+"""
+import types
+import logging
+import pandas as pd
+
+from collections import defaultdict
+from functools import reduce
+
+from libraries.logging.logging_utils import configure_logging
+from libraries.exception_handling import InputChecks
+          
+class StatisticalFeatures:
+    '''
+    Groups data by index columns and returns aggregated statistics for given columns
+    
+    :param list of tuples or dict index_cols: 
+        is either a list of tuples of form: [(colname_1, [aggfunc_1, aggfunc_2]), 
+                                             (colname_2, aggfunc_3)]
+        or a dictionary of form: {colname_1 : [aggfunc_1, aggfunc_2], colname_2 : aggfunc_3}
+        where colname_i is column to aggregate and aggfunc_i are either 
+        function variables or strings accepted by pandas for built-in function names.
+        REMARK: using strings for built-in functions will speed up the calculations by a factor >= 20.
+        WARNING: if multiple aggfuncs with the same name are given for a given column (like 'sum' and np.sum),
+        then only the first one is kept.
+        WARNING: nan values are ignored by numpy and pandas built-in aggregation functions.
+        
+    '''
+    def __init__(self, data, index_cols, path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+            
+        self.logger = logging.getLogger(__name__)
+        
+        self.checks = InputChecks(logger = self.logger)
+        
+        self.data = data
+        
+        self.checks.assert_correct_type({'data': [pd.DataFrame]})
+            
+        self.index_cols = index_cols
+        
+        # make warning about missing values in index columns
+        for col in self.index_cols:
+            if data[col].isnull().any():
+                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
+
+        
+    def get_kpis_by_aggregation(self, kpis):
+        '''
+        Aggregates given fields with given aggregation functions
+         USE CASE: per product find mean and standard variation of a price
+        
+        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
+         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
+         where aggfunc-s are reducing functions of either function type or strings standing for functions built in pandas module
+         
+        :return: features with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        def get_valid_agg_dict_from_kpis(kpis):
+            '''
+            Filters inputs of incorrect shape or type,
+            Filters out columns not present in data
+            Removes multiple functions with the same name
+            Makes a quick check that the aggregation with given fields and functions does not fail on the first 2 lines
+            Reports to the log
+            :param list or dict kpis:
+            '''
+            def get_name(x):
+                '''
+                Returns function name for function and does nothing for string
+                '''
+                if isinstance(x, types.FunctionType):
+                    return x.__name__
+                else:
+                    return x
+                
+            def passed_first_line_type_control(col, aggfunc):
+                '''
+                Checks if aggregation works on the first 2 lines of the data
+                '''
+                try:
+                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+                    self.data.iloc[:2]\
+                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                             .groupby(self.index_cols)\
+                             .agg({col : aggfunc})
+                    return True
+                except Exception as e:
+                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error : ' + str(e))
+                    return False
+           
+            
+            
+            valid_kpi_dict = defaultdict(list)
+            
+            if isinstance(kpis, list):
+                incorrect_length_kpis = [kpi for kpi in kpis if len(kpi) != 2]
+                if len(incorrect_length_kpis) > 0:
+                    self.logger.warning('Inputs ' + str(incorrect_length_kpis) + ' do not have correct length.')
+                
+                cols = list(zip(*kpis))[0]             
+                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
+            elif isinstance(kpis, dict):
+                cols = list(kpis.keys())
+                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
+                
+            cols_not_in_data = set(cols) - set(self.data.columns)
+            if len(cols_not_in_data) > 0:
+                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
+                
+            for col, aggfuncs in kpis:
+                if not isinstance(aggfuncs, list):
+                    aggfuncs = [aggfuncs]
+                
+                for aggfunc in aggfuncs:
+                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
+                    if not is_new_funcname:
+                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
+                    
+                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
+                        valid_kpi_dict[col].append(aggfunc)
+                    
+            return valid_kpi_dict
+                   
+        
+        
+        
+        agg_dict = get_valid_agg_dict_from_kpis(kpis)
+        
+        if len(agg_dict) > 0:
+        
+            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
+                             else '_'.join([col, str(aggfunc)]) 
+                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
+            
+            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                       .groupby(self.index_cols)\
+                       .agg(agg_dict)\
+                       .set_axis(new_names, axis = 'columns', inplace = False)\
+                       .reset_index()
+        else:
+            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+        
+        
+        
+        
+        
+        
+        
+    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
+        '''
+        A wrapper around the pandas crosstab method with index equal to index_cols
+        USE CASE: per product find the standard deviation of the price in each city
+
+        :param str pivot_col: column whose values become columns in the output
+        :param str value_col: column whose values fill the table
+        :param str or func aggfunc: count if None
+        :param list entries: values of pivot_col to show
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        
+        # assert that types of the inputs are correct
+        types_to_check = {'pivot_col' : [str],
+                          'value_col' : [str, type(None)],
+                          'aggfunc' : [str, types.FunctionType, type(None)],
+                          'entries' : [list, type(None)]}
+        
+        self.checks.assert_correct_type(types_to_check)
+        
+        cols_to_check = [pivot_col]
+        if not value_col is None:
+            cols_to_check.append(value_col)
+        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
+
+        if not entries is None:
+            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
+        else:
+            entry_filter = pd.Series([True]*len(self.data))              
+    
+        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
+        columns = self.data.loc[entry_filter, pivot_col]
+        if not value_col is None:
+            values = self.data.loc[entry_filter, value_col]
+        else:
+            values = None
+
+        result = pd.crosstab(index = index, columns = columns, values = values, aggfunc = aggfunc)
+        result = result.rename(columns = {c : str(value_col) + '_' + str(c) for c in result.columns})\
+                       .reset_index()
+        return result
+    
+
+
+
+
+        
+    
+    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
+        '''
+        Aggregates values obtained with method get_value_stats
+         USE CASE: per product find average variation of the price over all cities
+         
+        :param str pivot_col:
+        :param str value_col:
+        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
+        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
+        :param list entries: 
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
+        
+        value_stat_kpis = self.get_value_stats(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
+
+        result = value_stat_kpis[self.index_cols].copy(deep = True)
+        
+        for aggfunc in aggfuncs_step2:
+            colname = '_'.join([str(aggfunc), str(aggfunc_step1), str(value_col), str(pivot_col)])
+            
+            if isinstance(aggfunc, str):
+                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
+            else:
+                result[colname] = value_stat_kpis.set_index(self.index_cols)\
+                                                 .apply(aggfunc, axis = 1)\
+                                                 .reset_index(drop = True)
+                                                 
+        return result
+                              
+                              
+                              
+                              
+                                                            
+    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
+        '''
+        Finds argmin or argmax of a column
+         USE CASE: per product find the city with maximum variation of the price
+        
+        :param str min_or_max: must be in ['min', 'max']
+        :param str pivot_col:
+        :param str value_col:
+        :param str aggfunc:    
+        '''
+        self.checks.assert_valid_value(argname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
+        
+        if min_or_max == 'max':
+            aggfuncs_step2 = ['idxmax']
+        else:
+            aggfuncs_step2 = ['idxmin']
+            
+        return self.get_aggregated_value_stats(pivot_col = pivot_col,
+                                               value_col = value_col,
+                                               aggfunc_step1 = aggfunc,
+                                               aggfuncs_step2 = aggfuncs_step2)
+        
+        
+        
+        
+    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
+    
+    # can also add peak detection from the other project and calculate the number of peaks. Probably first create a TimeSeriesManipulation class.
+    
+    # write tests for all methods

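Given the kpis format documented in the class and method docstrings above, a hedged usage sketch (import path assumed from the file location in this commit; data and column names hypothetical; libraries.logging and libraries.exception_handling must be importable for the constructor to run):

    import pandas as pd
    from cdplib.feature_engineering.StatisticalFeatures import StatisticalFeatures

    data = pd.DataFrame({"product": ["a", "a", "b"],
                         "price": [10.0, 12.0, 7.0]})

    sf = StatisticalFeatures(data=data, index_cols=["product"])

    # per product: mean and standard deviation of the price; string aggfuncs
    # are preferred (see the REMARK about built-in function names)
    features = sf.get_kpis_by_aggregation(kpis={"price": ["mean", "std"]})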
+ 77 - 0
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 15:11:21 2018
+
+@author: tanya
+"""
+
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
+
+
+class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
+    '''
+    Splits the selected time window into periods of length period_length and
+    averages the per-period statistical features over n_periods periods.
+    '''
+    
+    def __init__(self, data, index_cols, date_col, split_date, period_length, past_or_future = 'past', freq = 'days', n_periods = 1, path_to_log = None):
+        '''
+        '''
+        super().__init__(data = data.copy(deep = True),
+                         index_cols = index_cols,
+                         date_col = date_col,
+                         split_date = split_date,
+                         period_length = n_periods*period_length,
+                         past_or_future = past_or_future,
+                         freq = freq,
+                         path_to_log = path_to_log)
+        
+        self.period_number_col = 'period_number'
+        while self.period_number_col in data.columns:
+            self.period_number_col += '&'
+        
+        period_numbers = self.data[self.index_cols + [date_col]].drop_duplicates()\
+                             .groupby(index_cols)[date_col].cumcount()\
+                             .reset_index()\
+                             .assign(period_number = lambda x: x[0]/period_length)\
+                             .rename(columns = {'period_number' : self.period_number_col})
+
+        self.data = pd.merge(self.data, period_numbers, how = 'left', on = self.index_cols)
+                            
+        self.initial_index_cols = self.index_cols.copy()
+        self.index_cols.append(self.period_number_col)
+        
+        
+    def _aggregate_over_time_periods(self, df):
+        '''
+        '''
+        return df.drop(self.period_number_col, axis = 1)\
+                 .groupby(self.initial_index_cols)\
+                 .mean()\
+                 .reset_index()
+        
+        
+    def get_kpis_by_aggregation(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(
+            super().get_kpis_by_aggregation(**args))
+            
+            
+    def get_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(
+            super().get_value_stats(**args))
+        
+        
+    def get_aggregated_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(
+            super().get_aggregated_value_stats(**args))
+        
+    
+        

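A hedged call sketch for the period-averaged variant (import path assumed from the file location in this commit; data and argument values hypothetical):

    import pandas as pd
    from cdplib.feature_engineering.StatisticalFeaturesAveragedOverTimePeriods import StatisticalFeaturesAveragedOverTimePeriods

    events = pd.DataFrame({"product": ["a", "a", "b"],
                           "event_date": pd.to_datetime(["2018-10-05", "2018-10-20", "2018-10-07"]),
                           "price": [10.0, 12.0, 7.0]})

    sfa = StatisticalFeaturesAveragedOverTimePeriods(
        data=events,
        index_cols=["product"],
        date_col="event_date",
        split_date=pd.Timestamp("2018-11-01"),
        period_length=30,                 # 30-day periods
        n_periods=3,                      # look back over 3 such periods
        freq="days")

    # per-period KPIs, averaged over the periods
    features = sfa.get_kpis_by_aggregation(kpis={"price": ["mean"]})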
+ 53 - 0
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 14:02:18 2018
+
+@author: tanya
+"""
+
+import logging
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
+from libraries.exception_handling import InputChecks, InputCasts
+from libraries.logging.logging_utils import configure_logging
+
+class StatisticalFeaturesOverTime(StatisticalFeatures):
+    '''
+    Restricts the data to a time window relative to split_date (in the past
+    or the future) and computes the statistical features on that window.
+    '''
+    def __init__(self, data, index_cols, date_col, split_date, period_length = None, past_or_future = 'past', freq = 'days', path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+        self.logger = logging.getLogger(__name__)
+        self.checks = InputChecks(logger = self.logger)
+        self.casts = InputCasts(logger = self.logger)
+        
+        self.checks.assert_column_presence(data = data, colnames = [date_col])
+        self.checks.assert_valid_value(argname = 'past_or_future', val = past_or_future, valid_values = ['past', 'future'])
+        self.checks.assert_valid_value(argname = 'freq', val = freq, valid_values = ['seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years'])
+        
+        
+        # split_date and the date column are cast before they are used in the
+        # date arithmetic and the time mask below
+        split_date = self.casts.cast_arg_to_pandas_datetime(argname = 'split_date', val = split_date)
+        data[date_col] = self.casts.cast_column_to_pandas_datetime(series = data[date_col], colname = date_col, all_or_any = 'all')
+
+        if past_or_future == 'past':
+            if not period_length is None:
+                min_date = split_date - pd.DateOffset(**{freq : period_length})
+            else:
+                min_date = data[date_col].min()
+            sup_date = split_date
+        else:
+            min_date = split_date
+            if not period_length is None:
+                sup_date = split_date + pd.DateOffset(**{freq : period_length})
+            else:
+                sup_date = split_date + pd.DateOffset(**{freq : 1})
+        
+            
+        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)
+        
+        super().__init__(data = data.loc[time_mask].reset_index(drop = True).copy(deep = True),
+                         index_cols = index_cols,
+                         path_to_log = path_to_log)

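The time-window selection above reduces to a pandas date filter. A standalone sketch of the 'past' branch (names and dates hypothetical):

    import pandas as pd

    split_date = pd.Timestamp("2018-11-07")
    period_length, freq = 14, "days"

    data = pd.DataFrame({"event_date": pd.to_datetime(["2018-10-20", "2018-11-01", "2018-11-07"])})

    # 'past' branch: keep rows with min_date <= date < split_date
    min_date = split_date - pd.DateOffset(**{freq: period_length})
    time_mask = (data["event_date"] >= min_date) & (data["event_date"] < split_date)
    windowed = data.loc[time_mask].reset_index(drop=True)   # only "2018-11-01" survives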
+ 0 - 0
cdplib/hyperopt/__init__.py


+ 0 - 0
cdplib/unit_tests/TestFlattenData.py


+ 0 - 0
cdplib/unit_tests/TestLog.py


+ 0 - 0
cdplib/unit_tests/TestMongodbHandler.py


+ 0 - 0
cdplib/unit_tests/invalid_test_schema.json


+ 0 - 0
cdplib/unit_tests/valid_test_schema.json


+ 0 - 0
cdplib/utils/__init__.py