|
@@ -0,0 +1,177 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Created on Thu Oct 18 16:26:47 2018
|
|
|
+
|
|
|
+@author: tanya
|
|
|
+"""
|
|
|
+
|
|
|
+import os
|
|
|
+import unittest
|
|
|
+import logging
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+
|
|
|
+from pandas.util.testing import assert_frame_equal
|
|
|
+
|
|
|
+from libraries.feature_engineering.in_memory_feature_engineering.StatisticalFeatures import StatisticalFeatures
|
|
|
+from libraries.logging.logging_utils import configure_logging
|
|
|
+
|
|
|
+
|
|
|
class TestStatisticalFeatures(unittest.TestCase):
    '''Base test case: holds a sample DataFrame, the index columns and the
    StatisticalFeatures instance built from them.

    The fixture is exposed as ``self.data``, ``self.index_cols`` and
    ``self.obj``.
    '''
    def __init__(self, data = None, index_cols = None, path_to_log = None,
                 methodName = 'runTest'):
        '''
        :param DataFrame data: data under test; when None, a built-in sample
            frame sorted by the index columns is used
        :param list index_cols: columns identifying a group,
            defaults to ['id1', 'id2']
        :param str path_to_log: passed on unchanged to StatisticalFeatures
        :param str methodName: name of the test method to run, as expected
            by unittest.TestCase
        '''
        # pd.datetime was only an alias of datetime.datetime and is gone in
        # recent pandas versions; the standard-library class builds the very
        # same objects.
        from datetime import datetime

        # unittest loaders instantiate test cases as ``cls(methodName)``, so
        # the method name arrives in the first positional slot (``data``).
        if isinstance(data, str):
            methodName, data = data, None

        # Without this call the TestCase machinery (test method name,
        # cleanups, ...) is never initialised and the case cannot be run.
        super(TestStatisticalFeatures, self).__init__(methodName)

        if index_cols is None:
            self.index_cols = ['id1', 'id2']
        else:
            self.index_cols = index_cols

        if data is None:
            # Sample covering integers, floats with a gap, strings with a
            # missing and an empty value, mixed datetime representations,
            # an all-NaN column and a missing value in an index column.
            self.data = pd.DataFrame({'int' : [1,2,3,2,55,3,7],
                                      'float' : [0.1, 7, 0.1, 99.9, 99.9, np.nan, 7],
                                      'str' : ['a', np.nan, 'c', 'a', 'a', '', 'c'],
                                      'datetime' : [datetime(2017, 1, 2), np.nan, datetime(2017, 5, 3), datetime(2017, 1, 4),
                                                    '2018-01-19', datetime(2018, 1, 4), datetime(2019, 3, 23)],
                                      'nan' : [np.nan]*7,
                                      'id1' : [1,1,3,3,3,1,1],
                                      'id2' : ['a', 'a', 'b', 'b', 'a', 'a', np.nan]})\
                                      .sort_values(by = self.index_cols)
        else:
            self.data = data

        self.obj = StatisticalFeatures(data = self.data, index_cols = self.index_cols, path_to_log = path_to_log)
|
|
|
+
|
|
|
class TestKpisByAggregation(TestStatisticalFeatures):
    '''Tests of the method get_kpis_by_aggregation, run against the fixture
    (``self.obj``, ``self.data``, ``self.index_cols``) set up by the parent
    class.

    Every test accepts optional ``kpis`` and ``answer`` overrides. An answer
    supplied by the caller is compared as given: it is neither sorted nor
    re-indexed here.
    '''
    def __init__(self, data = None, index_cols = None, path_to_log = None):
        '''
        :param DataFrame data: passed on to the parent class
        :param list index_cols: passed on to the parent class
        :param str path_to_log: passed on to the parent class
        '''
        super(TestKpisByAggregation, self).__init__(data = data, index_cols = index_cols, path_to_log = path_to_log)

    def _expected_frame(self, records):
        '''Turns a list of row dicts into an expected result, sorted by and
        indexed on the index columns.

        :param list of dict records: one dict per expected row
        :return: DataFrame indexed by self.index_cols
        '''
        return pd.DataFrame(records).sort_values(self.index_cols).set_index(self.index_cols)

    def _assert_kpis(self, kpis, answer):
        '''Computes the kpis, sorts and indexes the result by the index
        columns and compares it with the answer.

        :param list of tuples or dict kpis: kpi definition under test
        :param DataFrame answer: expected result; its columns are taken in
            the order of the result columns before comparing
        '''
        result = self.obj.get_kpis_by_aggregation(kpis = kpis)\
                     .sort_values(self.index_cols)\
                     .set_index(self.index_cols)

        assert_frame_equal(result, answer[result.columns])

    def test_builtin_aggfuncs_numeric_cols(self, answer = None, kpis = None):
        '''Tests the expected behaviour of pandas builtin aggregation function,
        in particular behaviour with missing values

        :param DataFrame answer: expected result, defaults to the values
            derived from the sample data
        :param list of tuples or dict kpis: kpi definition, defaults to a
            list of (column, aggregation function(s)) tuples
        '''
        kpis = kpis or [('int', ['min', 'std']),
                        ('float', ['mean', np.sum]),
                        ('float', 'sum'),
                        ('nan', 'mean')]

        # "answer or ..." would raise on a DataFrame, whose truth value is
        # ambiguous, hence the explicit comparison with None.
        if answer is None:
            answer = self._expected_frame([
                {'id1' : 1, 'id2' : 'a', 'int_min' : 1, 'int_std' : pd.Series([1,2,3]).std(), 'float_mean' : np.mean([0.1, 7.0]), 'float_sum' : 7.1, 'nan_mean' : np.nan},
                {'id1' : 3, 'id2' : 'b', 'int_min' : 2, 'int_std' : pd.Series([2,3]).std(), 'float_mean' : np.mean([0.1, 99.9]), 'float_sum' : 100, 'nan_mean' : np.nan},
                {'id1' : 3, 'id2' : 'a', 'int_min' : 55, 'int_std' : np.nan, 'float_mean' : 99.9, 'float_sum' : 99.9, 'nan_mean' : np.nan},
            ])

        self._assert_kpis(kpis, answer)

    def test_dict_kpi(self, kpis = None, answer = None):
        '''Tests kpis given as a dict mapping a column to one aggregation
        function or to a list of them.

        :param dict kpis: kpi definition
        :param DataFrame answer: expected result
        '''
        kpis = kpis or {'int' : ['min', 'std'], 'float' : 'mean'}

        if answer is None:
            answer = self._expected_frame([
                {'id1' : 1, 'id2' : 'a', 'int_min' : 1, 'int_std' : pd.Series([1,2,3]).std(), 'float_mean' : np.mean([0.1, 7.0])},
                {'id1' : 3, 'id2' : 'b', 'int_min' : 2, 'int_std' : pd.Series([2,3]).std(), 'float_mean' : np.mean([0.1, 99.9])},
                {'id1' : 3, 'id2' : 'a', 'int_min' : 55, 'int_std' : np.nan, 'float_mean' : 99.9},
            ])

        self._assert_kpis(kpis, answer)

    def test_string_cols(self, kpis = None, answer = None):
        '''Tests the aggregation of a string column with 'sum'.

        :param dict kpis: kpi definition
        :param DataFrame answer: expected result
        '''
        kpis = kpis or {'str' : ['sum']}

        if answer is None:
            answer = self._expected_frame([
                {'id1' : 1, 'id2' : 'a', 'str_sum' : 'anan'},
                {'id1' : 3, 'id2' : 'b', 'str_sum' : 'ca'},
                {'id1' : 3, 'id2' : 'a', 'str_sum' : 'a'},
            ])

        self._assert_kpis(kpis, answer)

    def test_custom_aggfunc(self, kpis = None, answer = None):
        '''Tests a user defined aggregation function.

        :param dict kpis: kpi definition, defaults to a custom sum over the
            column 'int'
        :param DataFrame answer: expected result
        '''
        if kpis is None:
            def custom_sum(x):
                return np.sum(x)

            kpis = {'int' : custom_sum}

        if answer is None:
            # Same sums as in test_some_wrong_col: in the sample data group
            # (3, 'a') holds the single value 55, group (3, 'b') holds 3 and 2.
            answer = self._expected_frame([
                {'id1' : 1, 'id2' : 'a', 'int_custom_sum' : 6},
                {'id1' : 3, 'id2' : 'a', 'int_custom_sum' : 55},
                {'id1' : 3, 'id2' : 'b', 'int_custom_sum' : 5},
            ])

        self._assert_kpis(kpis, answer)

    def test_some_wrong_col(self, kpis = None, answer = None):
        '''Tests kpis that refer to an existing and to a non-existing column;
        the expected result only contains the kpi of the existing column.

        :param dict kpis: kpi definition
        :param DataFrame answer: expected result
        '''
        kpis = kpis or {'bla' : 'sum', 'int' : 'sum'}

        if answer is None:
            answer = self._expected_frame([
                {'id1' : 1, 'id2' : 'a', 'int_sum' : 6},
                {'id1' : 3, 'id2' : 'a', 'int_sum' : 55},
                {'id1' : 3, 'id2' : 'b', 'int_sum' : 5},
            ])

        self._assert_kpis(kpis, answer)

    def test_all_wrong_cols(self, kpis = None, answer = None):
        '''Tests kpis that only refer to non-existing columns; the expected
        result is the frame of distinct index column values.

        :param dict kpis: kpi definition
        :param DataFrame answer: expected result, defaults to the distinct
            rows of the index columns of self.data
        '''
        kpis = kpis or {'bla' : 'sum', 'blub' : 'sum'}

        # Here the result is compared as returned, without sorting/indexing.
        result = self.obj.get_kpis_by_aggregation(kpis = kpis)

        if answer is None:
            # NOTE(review): drop_duplicates keeps rows with a missing value
            # in an index column - confirm that get_kpis_by_aggregation
            # returns such rows as well.
            answer = self.data[self.index_cols].drop_duplicates().reset_index(drop = True)

        assert_frame_equal(result, answer[result.columns])
|
|
|
+
|
|
|
if __name__ == '__main__':

    # The log file is placed inside the project tree. Without PROJECT_DIR
    # os.path.join would be handed None and fail with an opaque TypeError,
    # so stop early with an explicit message instead.
    project_dir = os.environ.get('PROJECT_DIR')

    if project_dir is None:
        raise EnvironmentError('The environment variable PROJECT_DIR is not set')

    path_to_log = os.path.join(project_dir,
                               'tests', 'test_feature_engineering','test_in_memory_feature_engineering',
                               'test_kpis_by_aggregation.log')

    configure_logging(path_to_log)
    logger = logging.getLogger(__name__)

    # The test methods are called directly on one instance rather than
    # through a unittest runner; test_custom_aggfunc is not part of this run.
    inst = TestKpisByAggregation(path_to_log = path_to_log)
    inst.test_builtin_aggfuncs_numeric_cols()
    inst.test_dict_kpi()
    inst.test_string_cols()
    inst.test_some_wrong_col()
    inst.test_all_wrong_cols()

    logger.info('Done testing method get_kpis_by_aggregation!')
|