tanja 3 years ago
parent
commit
4109ebff20

+ 0 - 0
README.md


+ 0 - 0
cdplib/DataExplorer/DataExplorer.py


+ 0 - 0
cdplib/FlattenData.py


+ 0 - 0
cdplib/Singleton_Threadsafe.py


+ 0 - 0
cdplib/__init__.py


+ 117 - 0
cdplib/db_handlers/InfluxdbHandler.py

@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Feb 23 19:44:22 2021
+
+@author: tanya
+"""
+
+from cdplib.log import Log
+
+import pandas as pd
+
+from influxdb import DataFrameClient
+
+
+class InfluxdbHandler:
+    """
+    """
+    def __init__(self, database_url: str = None):
+        """
+        :param database_url: DSN of the InfluxDB instance, for example
+         influxdb://user:password@host:port/database; if None, the url is
+         read from the [INFLUX] section of the configuration file
+        :type database_url: str
+        """
+        self._logger = Log("InfluxdbHandler:")
+
+        if database_url is None:
+            database_url = self._read_url_from_env()
+
+        self.client = DataFrameClient.from_dsn(database_url)
+
+    def _read_url_from_env(self) -> str:
+        """
+        :return: database url parsed from the configuration object.
+         The configuration object is created by the script
+         /libraries.configuration.py and uses a configuration file
+         (by default .env)
+        :rtype: str
+
+        """
+        try:
+            from libraries.configuration import default as cfg
+
+            assert(cfg["INFLUX"] is not None),\
+                "configuration file must contain [INFLUX]"
+
+            assert(set(["INFLUX_HOST", "INFLUX_PORT", "INFLUX_DATABASE_NAME"])
+                   <= set(cfg["INFLUX"])),\
+                ("configuration file must contain influx host, ",
+                 " port, and database name")
+
+            database_url = "influxdb://"
+
+            if "INFLUX_USER" in cfg["INFLUX"]:
+                database_url += cfg["INFLUX"]["INFLUX_USER"]
+
+            if "INFLUX_PASSWORD" in cfg["INFLUX"]:
+                database_url += ":" + cfg["INFLUX"]["INFLUX_PASSWORD"]
+
+            database_url += "@{0}:{1}/{2}".format(
+                cfg["INFLUX"]["INFLUX_HOST"],
+                cfg["INFLUX"]["INFLUX_PORT"],
+                cfg["INFLUX"]["INFLUX_DATABASE_NAME"])
+
+            return database_url
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                ("Could not parse url from configuration file. "
+                 "Exit with error {}".format(e)))
+
+    def query_to_dataframe(self, query: str) -> pd.DataFrame:
+        """
+        :param query: InfluxQL query to execute
+        :type query: str
+        :return: result of the query as a dataframe
+        :rtype: pd.DataFrame
+        """
+        try:
+            # result of the query is a defaultdict
+            result = self.client.query(query)
+
+            return list(result.values())[0]
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                ("Could not query to dataframe. "
+                 "Exit with error {}".format(e)))
+
+    def query_between_dates(self, columns: str,
+                            tables: str,
+                            start: str,
+                            stop: str) -> pd.DataFrame:
+        """
+        :param columns: comma-separated list of columns to select
+        :type columns: str
+        :param tables: measurement to select from
+        :type tables: str
+        :param start: lower bound of the time range (exclusive)
+        :type start: str
+        :param stop: upper bound of the time range (exclusive)
+        :type stop: str
+        :return: result of the query as a dataframe
+        :rtype: pd.DataFrame
+
+        """
+        query = 'SELECT ' +\
+                columns +\
+                ' FROM \"' +\
+                tables +\
+                '\" WHERE time > \'' +\
+                str(start) +\
+                '\' AND time  < \'' +\
+                str(stop) +\
+                '\' tz(\'Europe/Berlin\');'
+
+        return self.query_to_dataframe(query)

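A minimal usage sketch for the new handler (the DSN, measurement, and column below are hypothetical; query_between_dates assembles the InfluxQL statement shown above):

    from cdplib.db_handlers.InfluxdbHandler import InfluxdbHandler

    # explicit DSN; alternatively pass nothing and let the handler read the
    # [INFLUX] section of the configuration file
    handler = InfluxdbHandler("influxdb://user:password@localhost:8086/sensors")

    # SELECT temperature FROM "machine_1" WHERE time > '2021-02-01' AND time < '2021-02-23' tz('Europe/Berlin');
    df = handler.query_between_dates(columns="temperature",
                                     tables="machine_1",
                                     start="2021-02-01",
                                     stop="2021-02-23")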
+ 0 - 0
cdplib/db_handlers/MongodbHandler.py


+ 0 - 0
cdplib/db_handlers/__init__.py


+ 0 - 0
cdplib/db_migration/DataFrameToCollection.py


+ 27 - 25
cdplib/db_migration/MigrationCleaning.py

@@ -255,9 +255,11 @@ class MigrationCleaning:
             columns = db.get_column_names(tablename=self._inconsist_report_table)
 
             if len(columns) > 0:
-                columns_not_in_data = [column for column in columns if column not in data.columns]
-                for value in columns_not_in_data:
-                    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
+                # TODO Tanya: The commented lines caused the reason to be the same for all entries.
+
+                #columns_not_in_data = [column for column in columns if column not in data.columns]
+                #for value in columns_not_in_data:
+                #    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
                 data_inconsist = data_inconsist[columns]
 
         db.append_to_table(data=data_inconsist,
@@ -396,7 +398,7 @@ class MigrationCleaning:
                     data[column] = data[column].astype(python_type)
 
                 elif python_type == float:
-                    
+
                     data[column] = data[column].fillna(np.inf)
                     # Replaces empty fields when type is string
                     if data[column].dtypes == object:
@@ -564,15 +566,15 @@ class MigrationCleaning:
         return data
 
     def clean_json_from_None_object(self, data: pd.DataFrame, clean_bool: bool = True) -> pd.DataFrame():
-        
+
         data = data.to_json(date_format="iso")
         data = json.loads(data)
         new_data = remap(data, lambda p, k, v: v is not None)
         new_data = remap(new_data, lambda p, k, v: v != 'None')
         new_data = remap(new_data, lambda p, k, v: v != 'inf')
-        # cleans not only bool type also int which are 0 or 1 
+        # cleans not only bool type also int which are 0 or 1
         # only use if it is necessary have to be change that it only considers
-        # Ture and False for bools 
+        # True and False for bools
         if clean_bool:
             new_data = remap(new_data, lambda p, k, v: (isinstance(v,bool) or (not isinstance(v,bool) and bool(v))))
         return new_data
@@ -588,27 +590,27 @@ class MigrationCleaning:
 
 
     def map_toleranzen_values(self, data: pd.DataFrame, toleranzen: pd.DataFrame):
-        
+
         toleranzen.drop('nr', axis=1, inplace=True)
-        
+
         toleranzen.columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.geometrie.durchmesser.min', 'wellenschenkel.geometrie.durchmesser.max', 'innenring.geometrie.durchmesser.min',
                         'innenring.geometrie.durchmesser.max', 'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
 
-        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max', 
+        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max',
                                     'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
-        
+
         labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.geometrie.durchmesser.min', 'labyrinthring.geometrie.durchmesser.max']
-        
+
         reparatur_stufe_labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.reparatur_stufe.durchmesser.min', 'labyrinthring.reparatur_stufe.durchmesser.max']
 
-        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min', 
+        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel.reparatur_stufe.durchmesser.max', 'innenring.reparatur_stufe.durchmesser.min',
-                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min', 
+                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.max']
 
-        
+
         toleranzen_reference_columns = ['wellenschenkel_toleranz', 'labyrinthring_toleranz', 'wellen_reparatur_stufe_toleranz', 'labyrinthring_reparatur_stufe_toleranz']
-        
+
         available_columns = [column for column in data.columns if column in toleranzen_reference_columns]
         for column in available_columns:
             merge_map = [False] *len(data.index)
@@ -623,13 +625,13 @@ class MigrationCleaning:
 
                     else:
                         temp_toleranzen.columns = labyrinten_columns
-                
+
                 elif 'reparatur_stufe' in column:
                     temp_toleranzen.columns = reparatur_stufe_columns
                     merge_map = data['innenring_reparatur_stufe_zulaessig'] == 'Ja'
                 data_before = len(data.index)
                 data = data.merge(temp_toleranzen, how='left', left_on=column, right_on='toleranzbez_wellen_reference')
-                data.loc[merge_map, temp_toleranzen.columns] = np.nan 
+                data.loc[merge_map, temp_toleranzen.columns] = np.nan
                 if data_before != len(data.index):
                     print('WEVE LOST DATA!!')
                     print('before:', data_before, 'now:', len(data.index))
@@ -641,9 +643,9 @@ class MigrationCleaning:
 
     def label_is_level(
                     self,
-                    data: pd.DataFrame, 
-                    column: str = "is", 
-                    include_schrott: bool = False, 
+                    data: pd.DataFrame,
+                    column: str = "is",
+                    include_schrott: bool = False,
                     drop_rows_with_no_is: bool = False) -> pd.DataFrame:
         '''
         '''
@@ -659,16 +661,16 @@ class MigrationCleaning:
                 data.loc[data[column].isin(v), column] = k
             else:
                 data.loc[data[column].isnull(), column] = k
-        
+
         if include_schrott and ("operation_type_2" in data.columns):
             schrott_mask = (data["operation_type_2"] == 2)
             data.loc[schrott_mask, column] = 5
-        
+
         data.loc[~data[column].isin([0,1,2,3,4,5]), column] = 0
-                    
+
         if drop_rows_with_no_is:
             data = data.loc[data[column] != 0].copy(deep=True)
-            
+
         return data.reset_index(drop=True)
 
 

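The label_is_level hunk above is mostly whitespace, but the logic it touches is easier to read in isolation. A standalone sketch of the final labelling steps visible in the diff (sample data hypothetical):

    import pandas as pd

    data = pd.DataFrame({"is": [1, 3, 99, None], "operation_type_2": [2, 0, 0, 0]})
    column = "is"

    # include_schrott branch: rows flagged as scrap get level 5
    schrott_mask = (data["operation_type_2"] == 2)
    data.loc[schrott_mask, column] = 5

    # anything outside the known is-levels falls back to 0
    data.loc[~data[column].isin([0, 1, 2, 3, 4, 5]), column] = 0

    # drop_rows_with_no_is branch: optionally drop level-0 rows
    data = data.loc[data[column] != 0].copy(deep=True).reset_index(drop=True)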
+ 0 - 0
cdplib/db_migration/ParseDbSchema.py


+ 0 - 0
cdplib/db_migration/ParseJsonSchema.py


+ 0 - 0
cdplib/db_migration/ParseMapping.py


+ 270 - 0
cdplib/feature_engineering/StatisticalFeatures.py

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" 
+Created on Tue Oct 16 16:08:47 2018
+
+@author: tanya
+"""
+import types
+import logging
+import pandas as pd
+
+from collections import defaultdict
+from functools import reduce
+
+from libraries.logging.logging_utils import configure_logging
+from libraries.exception_handling import InputChecks
+          
+class StatisticalFeatures:
+    '''
+    Groups data by index columns and returns aggregated statistics for given columns
+    
+    :param list of tuples or dict index_cols: 
+        is either a list of tuples of form: [(colname_1, [aggfunc_1, aggfunc_2]), 
+                                             (colname_2, aggfunc_3)]
+        or a dictionary of form: {colname_1 : [aggfunc_1, aggfunc_2], colname_2 : aggfunc_3}
+        where colname_i is column to aggregate and aggfunc_i are either 
+        function variables or strings accepted by pandas for built-in function names.
+        REMARK: using strings for built-in functions will speed up the calculations by a factor >= 20.
+        WARNING: if multiple aggfuncs with the same name are given for a given column (like 'sum' and np.sum),
+        then only the first one is kept.
+        WARNING: nan values are ignored by numpy and pandas built-in aggregation functions.
+        
+    '''
+    def __init__(self, data, index_cols, path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+            
+        self.logger = logging.getLogger(__name__)
+        
+        self.checks = InputChecks(logger = self.logger)
+        
+        self.data = data
+        
+        self.checks.assert_correct_type({'data': [pd.DataFrame]})
+            
+        self.index_cols = index_cols
+        
+        # make warning about missing values in index columns
+        for col in self.index_cols:
+            if data[col].isnull().any():
+                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
+
+        
+    def get_kpis_by_aggregation(self, kpis):
+        '''
+        Aggregates given fields with given aggregation functions
+         USE CASE: per product find mean and standard variation of a price
+        
+        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
+         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
+         where aggfunc-s are reducing functions of either function type or strings standing for functions built in pandas module
+         
+        :return: features with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        def get_valid_agg_dict_from_kpis(kpis):
+            '''
+            Filters inputs of incorrect shape or type,
+            Filters out columns not present in data
+            Removes multiple functions with the same name
+            Makes a quick check that the aggregation with given fields and functions does not fail on the first 2 lines
+            Reports to the log
+            :param list or dict kpis:
+            '''
+            def get_name(x):
+                '''
+                Returns function name for function and does nothing for string
+                '''
+                if isinstance(x, types.FunctionType):
+                    return x.__name__
+                else:
+                    return x
+                
+            def passed_first_line_type_control(col, aggfunc):
+                '''
+                Checks if aggregation works on the first 2 lines of the data
+                '''
+                try:
+                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+                    self.data.iloc[:2]\
+                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                             .groupby(self.index_cols)\
+                             .agg({col : aggfunc})
+                    return True
+                except Exception as e:
+                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error : ' + str(e))
+                    return False
+           
+            
+            
+            valid_kpi_dict = defaultdict(list)
+            
+            if isinstance(kpis, list):
+                incorrect_length_kpis = [kpi for kpi in kpis if len(kpi) != 2]
+                if len(incorrect_length_kpis) > 0:
+                    self.logger.warning('Inputs ' + str(incorrect_length_kpis) + ' do not have correct length.')
+                
+                cols = list(zip(*kpis))[0]             
+                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
+            elif isinstance(kpis, dict):
+                cols = list(kpis.keys())
+                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
+                
+            cols_not_in_data = set(cols) - set(self.data.columns)
+            if len(cols_not_in_data) > 0:
+                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
+                
+            for col, aggfuncs in kpis:
+                if not isinstance(aggfuncs, list):
+                    aggfuncs = [aggfuncs]
+                
+                for aggfunc in aggfuncs:
+                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
+                    if not is_new_funcname:
+                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
+                    
+                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
+                        valid_kpi_dict[col].append(aggfunc)
+                    
+            return valid_kpi_dict
+                   
+        
+        
+        
+        agg_dict = get_valid_agg_dict_from_kpis(kpis)
+        
+        if len(agg_dict) > 0:
+        
+            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
+                             else '_'.join([col, str(aggfunc)]) 
+                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
+            
+            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                       .groupby(self.index_cols)\
+                       .agg(agg_dict)\
+                       .set_axis(new_names, axis = 'columns', inplace = False)\
+                       .reset_index()
+        else:
+            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+        
+        
+        
+        
+        
+        
+        
+    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
+        '''
+        A wrapper around the pandas crosstab method with index equal to index_cols
+        USE CASE: per product find the standard deviation of the price in each city
+
+        :param str pivot_col: column whose values become columns in the output
+        :param str value_col: column whose values fill the table
+        :param str or func aggfunc: count if None
+        :param list entries: values of pivot_col to show
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        
+        # assert that types of the inputs are correct
+        types_to_check = {'pivot_col' : [str],
+                          'value_col' : [str, type(None)],
+                          'aggfunc' : [str, types.FunctionType, type(None)],
+                          'entries' : [list, type(None)]}
+        
+        self.checks.assert_correct_type(types_to_check)
+        
+        cols_to_check = [pivot_col]
+        if not value_col is None:
+            cols_to_check.append(value_col)
+        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
+
+        if not entries is None:
+            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
+        else:
+            entry_filter = pd.Series([True]*len(self.data))              
+    
+        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
+        columns = self.data.loc[entry_filter, pivot_col]
+        if not value_col is None:
+            values = self.data.loc[entry_filter, value_col]
+        else:
+            values = None
+
+        result = pd.crosstab(index = index, columns = columns, values = values, aggfunc = aggfunc)
+        result = result.rename(columns = {c : str(value_col) + '_' + str(c) for c in result.columns})\
+                       .reset_index()
+        return result
+    
+
+
+
+
+        
+    
+    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
+        '''
+        Aggregates values obtained with method get_value_stats
+         USE CASE: per product find average variation of the price over all cities
+         
+        :param str pivot_col:
+        :param str value_col:
+        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
+        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
+        :param list entries: 
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
+        
+        value_stat_kpis = self.get_value_stats(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
+
+        result = value_stat_kpis[self.index_cols].copy(deep = True)
+        
+        for aggfunc in aggfuncs_step2:
+            colname = '_'.join([str(aggfunc), str(aggfunc_step1), str(value_col), str(pivot_col)])
+            
+            if isinstance(aggfunc, str):
+                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
+            else:
+                result[colname] = value_stat_kpis.set_index(self.index_cols)\
+                                                 .apply(aggfunc, axis = 1)\
+                                                 .reset_index(drop = True)
+                                                 
+        return result
+                              
+                              
+                              
+                              
+                                                            
+    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
+        '''
+        Finds argmin or argmax of a column
+         USE CASE: per product find the city with maximum variation of the price
+        
+        :param str min_or_max: must be in ['min', 'max']
+        :param str pivot_col:
+        :param str value_col:
+        :param str aggfunc:    
+        '''
+        self.checks.assert_valid_value(argname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
+        
+        if min_or_max == 'max':
+            aggfuncs_step2 = ['idxmax']
+        else:
+            aggfuncs_step2 = ['idxmin']
+            
+        return self.get_aggregated_value_stats(pivot_col = pivot_col,
+                                               value_col = value_col,
+                                               aggfunc_step1 = aggfunc,
+                                               aggfuncs_step2 = aggfuncs_step2)
+        
+        
+        
+        
+    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
+    
+    # can also add peak detection from the other project and calculate the number of peaks. Probably first create a TimeSeriesManipulation class.
+    
+    # write tests for all methods

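Given the kpis format documented in the class and method docstrings above, a hedged usage sketch (import path assumed from the file location in this commit; data and column names hypothetical; libraries.logging and libraries.exception_handling must be importable for the constructor to run):

    import pandas as pd
    from cdplib.feature_engineering.StatisticalFeatures import StatisticalFeatures

    data = pd.DataFrame({"product": ["a", "a", "b"],
                         "price": [10.0, 12.0, 7.0]})

    sf = StatisticalFeatures(data=data, index_cols=["product"])

    # per product: mean and standard deviation of the price; string aggfuncs
    # are preferred (see the REMARK about built-in function names)
    features = sf.get_kpis_by_aggregation(kpis={"price": ["mean", "std"]})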
+ 77 - 0
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 15:11:21 2018
+
+@author: tanya
+"""
+
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
+
+
+class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
+    '''
+    Splits the selected time window into periods of length period_length and
+    averages the per-period statistical features over n_periods periods.
+    '''
+    
+    def __init__(self, data, index_cols, date_col, split_date, period_length, past_or_future = 'past', freq = 'days', n_periods = 1, path_to_log = None):
+        '''
+        '''
+        super().__init__(data = data.copy(deep = True),
+                         index_cols = index_cols,
+                         date_col = date_col,
+                         split_date = split_date,
+                         period_length = n_periods*period_length,
+                         past_or_future = past_or_future,
+                         freq = freq,
+                         path_to_log = path_to_log)
+        
+        self.period_number_col = 'period_number'
+        while self.period_number_col in data.columns:
+            self.period_number_col += '&'
+        
+        period_numbers = self.data[self.index_cols + [date_col]].drop_duplicates()\
+                             .groupby(index_cols)[date_col].cumcount()\
+                             .reset_index()\
+                             .assign(period_number = lambda x: x[0]/period_length)\
+                             .rename(columns = {'period_number' : self.period_number_col})
+
+        self.data = pd.merge(self.data, period_numbers, how = 'left', on = self.index_cols)
+                            
+        self.initial_index_cols = self.index_cols.copy()
+        self.index_cols.append(self.period_number_col)
+        
+        
+    def _aggregate_over_time_periods(self, df):
+        '''
+        '''
+        return df.drop(self.period_number_col, axis = 1)\
+                 .groupby(self.initial_index_cols)\
+                 .mean()\
+                 .reset_index()
+        
+        
+    def get_kpis_by_aggregation(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(
+            super().get_kpis_by_aggregation(**args))
+            
+            
+    def get_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(
+            super().get_value_stats(**args))
+        
+        
+    def get_aggregated_value_stats(self, **args):
+        '''
+        '''
+        return self._aggregate_over_time_periods(
+            super().get_aggregated_value_stats(**args))
+        
+    
+        

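A hedged call sketch for the period-averaged variant (import path assumed from the file location in this commit; data and argument values hypothetical):

    import pandas as pd
    from cdplib.feature_engineering.StatisticalFeaturesAveragedOverTimePeriods import StatisticalFeaturesAveragedOverTimePeriods

    events = pd.DataFrame({"product": ["a", "a", "b"],
                           "event_date": pd.to_datetime(["2018-10-05", "2018-10-20", "2018-10-07"]),
                           "price": [10.0, 12.0, 7.0]})

    sfa = StatisticalFeaturesAveragedOverTimePeriods(
        data=events,
        index_cols=["product"],
        date_col="event_date",
        split_date=pd.Timestamp("2018-11-01"),
        period_length=30,                 # 30-day periods
        n_periods=3,                      # look back over 3 such periods
        freq="days")

    # per-period KPIs, averaged over the periods
    features = sfa.get_kpis_by_aggregation(kpis={"price": ["mean"]})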
+ 53 - 0
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 14:02:18 2018
+
+@author: tanya
+"""
+
+import logging
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
+from libraries.exception_handling import InputChecks, InputCasts
+from libraries.logging.logging_utils import configure_logging
+
+class StatisticalFeaturesOverTime(StatisticalFeatures):
+    '''
+    Restricts the data to a time window relative to split_date (in the past
+    or the future) and computes the statistical features on that window.
+    '''
+    def __init__(self, data, index_cols, date_col, split_date, period_length = None, past_or_future = 'past', freq = 'days', path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+        self.logger = logging.getLogger(__name__)
+        self.checks = InputChecks(logger = self.logger)
+        self.casts = InputCasts(logger = self.logger)
+        
+        self.checks.assert_column_presence(data = data, colnames = [date_col])
+        self.checks.assert_valid_value(argname = 'past_or_future', val = past_or_future, valid_values = ['past', 'future'])
+        self.checks.assert_valid_value(argname = 'freq', val = freq, valid_values = ['seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years'])
+        
+        
+        # split_date and the date column are cast before they are used in the
+        # date arithmetic and the time mask below
+        split_date = self.casts.cast_arg_to_pandas_datetime(argname = 'split_date', val = split_date)
+        data[date_col] = self.casts.cast_column_to_pandas_datetime(series = data[date_col], colname = date_col, all_or_any = 'all')
+
+        if past_or_future == 'past':
+            if not period_length is None:
+                min_date = split_date - pd.DateOffset(**{freq : period_length})
+            else:
+                min_date = data[date_col].min()
+            sup_date = split_date
+        else:
+            min_date = split_date
+            if not period_length is None:
+                sup_date = split_date + pd.DateOffset(**{freq : period_length})
+            else:
+                sup_date = split_date + pd.DateOffset(**{freq : 1})
+        
+            
+        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)
+        
+        super().__init__(data = data.loc[time_mask].reset_index(drop = True).copy(deep = True),
+                         index_cols = index_cols,
+                         path_to_log = path_to_log)

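The time-window selection above reduces to a pandas date filter. A standalone sketch of the 'past' branch (names and dates hypothetical):

    import pandas as pd

    split_date = pd.Timestamp("2018-11-07")
    period_length, freq = 14, "days"

    data = pd.DataFrame({"event_date": pd.to_datetime(["2018-10-20", "2018-11-01", "2018-11-07"])})

    # 'past' branch: keep rows with min_date <= date < split_date
    min_date = split_date - pd.DateOffset(**{freq: period_length})
    time_mask = (data["event_date"] >= min_date) & (data["event_date"] < split_date)
    windowed = data.loc[time_mask].reset_index(drop=True)   # only "2018-11-01" survives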
+ 0 - 0
cdplib/hyperopt/__init__.py


+ 0 - 0
cdplib/unit_tests/TestFlattenData.py


+ 0 - 0
cdplib/unit_tests/TestLog.py


+ 0 - 0
cdplib/unit_tests/TestMongodbHandler.py


+ 0 - 0
cdplib/unit_tests/invalid_test_schema.json


+ 0 - 0
cdplib/unit_tests/valid_test_schema.json


+ 0 - 0
cdplib/utils/__init__.py