#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 18 16:26:47 2018

@author: tanya

Tests for ``StatisticalFeatures.get_kpis_by_aggregation``.
"""

import os
import unittest
import logging
from datetime import datetime

import pandas as pd
import numpy as np

try:
    from pandas.testing import assert_frame_equal
except ImportError:
    # Legacy location, only needed for pandas versions without pandas.testing.
    from pandas.util.testing import assert_frame_equal

from libraries.feature_engineering.in_memory_feature_engineering.StatisticalFeatures import StatisticalFeatures
from libraries.logging.logging_utils import configure_logging


class TestStatisticalFeatures(unittest.TestCase):
    '''Base test case holding the shared fixture.

    Stores the input frame in ``self.data``, the grouping columns in
    ``self.index_cols`` and a ``StatisticalFeatures`` instance built from
    them in ``self.obj``.
    '''

    def __init__(self, data=None, index_cols=None, path_to_log=None,
                 methodName='runTest'):
        '''Build the fixture.

        :param DataFrame data: input data; a small mixed-type frame with
            missing values is used when omitted
        :param list index_cols: grouping columns, ``['id1', 'id2']`` when
            omitted
        :param str path_to_log: forwarded to ``StatisticalFeatures``
        :param str methodName: name of the test method, forwarded to
            ``unittest.TestCase``
        '''
        # unittest loaders instantiate test cases as ``cls(method_name)``,
        # which lands in the first positional slot. ``data`` is otherwise a
        # DataFrame, so a string there can only be a method name.
        if isinstance(data, str):
            methodName, data = data, None

        super(TestStatisticalFeatures, self).__init__(methodName)

        self.index_cols = ['id1', 'id2'] if index_cols is None else index_cols

        if data is None:
            # ``datetime`` is the stdlib class that the removed
            # ``pd.datetime`` alias used to point to. The column
            # deliberately mixes datetimes, a string and a missing value.
            data = pd.DataFrame({
                'int': [1, 2, 3, 2, 55, 3, 7],
                'float': [0.1, 7, 0.1, 99.9, 99.9, np.nan, 7],
                'str': ['a', np.nan, 'c', 'a', 'a', '', 'c'],
                'datetime': [datetime(2017, 1, 2),
                             np.nan,
                             datetime(2017, 5, 3),
                             datetime(2017, 1, 4),
                             '2018-01-19',
                             datetime(2018, 1, 4),
                             datetime(2019, 3, 23)],
                'nan': [np.nan] * 7,
                'id1': [1, 1, 3, 3, 3, 1, 1],
                'id2': ['a', 'a', 'b', 'b', 'a', 'a', np.nan],
            }).sort_values(by=self.index_cols)

        self.data = data

        self.obj = StatisticalFeatures(data=self.data,
                                       index_cols=self.index_cols,
                                       path_to_log=path_to_log)


class TestKpisByAggregation(TestStatisticalFeatures):
    '''Tests of ``get_kpis_by_aggregation`` against hand-computed answers.

    Every test accepts optional ``kpis`` / ``answer`` overrides; the
    built-in defaults refer to the default fixture of the base class.
    '''

    def __init__(self, data=None, index_cols=None, path_to_log=None,
                 methodName='runTest'):
        '''Forward all arguments to ``TestStatisticalFeatures``.'''
        super(TestKpisByAggregation, self).__init__(data=data,
                                                    index_cols=index_cols,
                                                    path_to_log=path_to_log,
                                                    methodName=methodName)

    def _indexed_answer(self, records):
        '''Turn a list of row dicts into a frame sorted and indexed by the index columns.'''
        return (pd.DataFrame(records)
                .sort_values(self.index_cols)
                .set_index(self.index_cols))

    def _assert_kpis_match(self, kpis, answer):
        '''Aggregate ``kpis`` and compare the sorted, indexed result with ``answer``.'''
        result = (self.obj.get_kpis_by_aggregation(kpis=kpis)
                  .sort_values(self.index_cols)
                  .set_index(self.index_cols))

        # Align the column order of the answer with the result before comparing.
        assert_frame_equal(result, answer[result.columns])

    def test_builtin_aggfuncs_numeric_cols(self, answer=None, kpis=None):
        '''Tests the expected behaviour of pandas builtin aggregation
        functions, in particular the behaviour with missing values.

        :param DataFrame answer: expected result, indexed by the index columns
        :param list of tuples or dict kpis: aggregations to compute
        '''
        kpis = kpis or [('int', ['min', 'std']),
                        ('float', ['mean', np.sum]),
                        ('float', 'sum'),
                        ('nan', 'mean')]

        # ``answer or ...`` would raise on a DataFrame (ambiguous truth value).
        if answer is None:
            answer = self._indexed_answer([
                {'id1': 1, 'id2': 'a',
                 'int_min': 1, 'int_std': pd.Series([1, 2, 3]).std(),
                 'float_mean': np.mean([0.1, 7.0]), 'float_sum': 7.1,
                 'nan_mean': np.nan},
                {'id1': 3, 'id2': 'b',
                 'int_min': 2, 'int_std': pd.Series([2, 3]).std(),
                 'float_mean': np.mean([0.1, 99.9]), 'float_sum': 100,
                 'nan_mean': np.nan},
                {'id1': 3, 'id2': 'a',
                 'int_min': 55, 'int_std': np.nan,
                 'float_mean': 99.9, 'float_sum': 99.9,
                 'nan_mean': np.nan},
            ])

        self._assert_kpis_match(kpis, answer)

    def test_dict_kpi(self, kpis=None, answer=None):
        '''Tests kpis given as a dict mapping a column to one or several
        aggregation functions.

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected result, indexed by the index columns
        '''
        kpis = kpis or {'int': ['min', 'std'], 'float': 'mean'}

        if answer is None:
            answer = self._indexed_answer([
                {'id1': 1, 'id2': 'a',
                 'int_min': 1, 'int_std': pd.Series([1, 2, 3]).std(),
                 'float_mean': np.mean([0.1, 7.0])},
                {'id1': 3, 'id2': 'b',
                 'int_min': 2, 'int_std': pd.Series([2, 3]).std(),
                 'float_mean': np.mean([0.1, 99.9])},
                {'id1': 3, 'id2': 'a',
                 'int_min': 55, 'int_std': np.nan,
                 'float_mean': 99.9},
            ])

        self._assert_kpis_match(kpis, answer)

    def test_string_cols(self, kpis=None, answer=None):
        '''Tests the ``sum`` aggregation of a string column.

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected result, indexed by the index columns
        '''
        kpis = kpis or {'str': ['sum']}

        if answer is None:
            answer = self._indexed_answer([
                {'id1': 1, 'id2': 'a', 'str_sum': 'anan'},
                {'id1': 3, 'id2': 'b', 'str_sum': 'ca'},
                {'id1': 3, 'id2': 'a', 'str_sum': 'a'},
            ])

        self._assert_kpis_match(kpis, answer)

    def test_custom_aggfunc(self, kpis=None, answer=None):
        '''Tests aggregation with a user-defined function.

        :param dict kpis: aggregations to compute; a custom sum over the
            ``int`` column when omitted
        :param DataFrame answer: expected result, indexed by the index columns
        '''
        if kpis is None:
            def custom_sum(x):
                return np.sum(x)

            kpis = {'int': custom_sum}

        if answer is None:
            # In the default fixture group (3, 'b') holds the ints 3 and 2
            # and group (3, 'a') holds 55.
            answer = self._indexed_answer([
                {'id1': 1, 'id2': 'a', 'int_custom_sum': 6},
                {'id1': 3, 'id2': 'b', 'int_custom_sum': 5},
                {'id1': 3, 'id2': 'a', 'int_custom_sum': 55},
            ])

        self._assert_kpis_match(kpis, answer)

    def test_some_wrong_col(self, kpis=None, answer=None):
        '''Tests kpis that refer to one existing and one non-existing
        column; the expected answer only contains the existing one.

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected result, indexed by the index columns
        '''
        kpis = kpis or {'bla': 'sum', 'int': 'sum'}

        if answer is None:
            answer = self._indexed_answer([
                {'id1': 1, 'id2': 'a', 'int_sum': 6},
                {'id1': 3, 'id2': 'a', 'int_sum': 55},
                {'id1': 3, 'id2': 'b', 'int_sum': 5},
            ])

        self._assert_kpis_match(kpis, answer)

    def test_all_wrong_cols(self, kpis=None, answer=None):
        '''Tests kpis that only refer to non-existing columns; the expected
        answer is the de-duplicated index columns of the input data.

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected result (not indexed)
        '''
        kpis = kpis or {'bla': 'sum', 'blub': 'sum'}

        result = self.obj.get_kpis_by_aggregation(kpis=kpis)

        if answer is None:
            answer = (self.data[self.index_cols]
                      .drop_duplicates()
                      .reset_index(drop=True))

        assert_frame_equal(result, answer[result.columns])


if __name__ == '__main__':

    project_dir = os.environ.get('PROJECT_DIR')

    if project_dir is None:
        # os.path.join(None, ...) would otherwise fail with an opaque TypeError.
        raise RuntimeError('Environment variable PROJECT_DIR must be set '
                           'to run these tests as a script.')

    path_to_log = os.path.join(project_dir,
                               'tests',
                               'test_feature_engineering',
                               'test_in_memory_feature_engineering',
                               'test_kpis_by_aggregation.log')

    configure_logging(path_to_log)

    logger = logging.getLogger(__name__)

    inst = TestKpisByAggregation(path_to_log=path_to_log)

    # Same set of tests as the original script run.
    inst.test_builtin_aggfuncs_numeric_cols()
    inst.test_dict_kpi()
    inst.test_string_cols()
    inst.test_some_wrong_col()
    inst.test_all_wrong_cols()

    logger.info('Done testing method get_kpis_by_aggregation!')