123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Thu Oct 18 16:26:47 2018
- @author: tanya
- """
- import os
- import unittest
- import logging
- import pandas as pd
- import numpy as np
- from pandas.util.testing import assert_frame_equal
- from libraries.feature_engineering.in_memory_feature_engineering.StatisticalFeatures import StatisticalFeatures
- from libraries.logging.logging_utils import configure_logging
class TestStatisticalFeatures(unittest.TestCase):
    '''Shared fixture for the StatisticalFeatures test cases.

    Holds a small sample frame with mixed column types (or a frame passed
    in by the caller), the list of index columns, and a StatisticalFeatures
    instance built from both, available as ``self.obj``.
    '''
    def __init__(self, data=None, index_cols=None, path_to_log=None):
        '''Prepare the sample data and the object under test.

        :param DataFrame data: frame to run the tests on; when omitted the
            built-in sample frame is used. A string in this position is
            treated as the test method name passed by a unittest loader.
        :param list index_cols: columns identifying a group,
            defaults to ['id1', 'id2']
        :param str path_to_log: forwarded unchanged to StatisticalFeatures
        '''
        # pd.datetime was a plain alias of datetime.datetime and has been
        # deprecated/removed in newer pandas releases; importing the class
        # from the standard library yields the very same objects.
        from datetime import datetime

        # unittest loaders create test cases as ``cls(methodName)``, so the
        # method name arrives in the first positional slot, i.e. ``data``.
        method_name = 'runTest'
        if isinstance(data, str):
            method_name, data = data, None

        # Without this call the instance lacks the state TestCase sets up
        # (cleanups, assertion helpers, the bound test method name).
        super(TestStatisticalFeatures, self).__init__(method_name)

        self.index_cols = ['id1', 'id2'] if index_cols is None else index_cols

        if data is None:
            # One column per data type, including missing values, an
            # all-NaN column and a missing group key in 'id2'.
            data = pd.DataFrame({
                'int': [1, 2, 3, 2, 55, 3, 7],
                'float': [0.1, 7, 0.1, 99.9, 99.9, np.nan, 7],
                'str': ['a', np.nan, 'c', 'a', 'a', '', 'c'],
                'datetime': [datetime(2017, 1, 2), np.nan,
                             datetime(2017, 5, 3), datetime(2017, 1, 4),
                             '2018-01-19', datetime(2018, 1, 4),
                             datetime(2019, 3, 23)],
                'nan': [np.nan] * 7,
                'id1': [1, 1, 3, 3, 3, 1, 1],
                'id2': ['a', 'a', 'b', 'b', 'a', 'a', np.nan],
            }).sort_values(by=self.index_cols)

        self.data = data

        self.obj = StatisticalFeatures(data=self.data,
                                       index_cols=self.index_cols,
                                       path_to_log=path_to_log)
-
class TestKpisByAggregation(TestStatisticalFeatures):
    '''Tests for the method get_kpis_by_aggregation of StatisticalFeatures.

    Every test feeds a ``kpis`` specification to ``self.obj`` and compares
    the outcome with a frame whose value columns are named
    ``<column>_<aggregation function>``. Both ``kpis`` and ``answer`` can be
    overridden by the caller; the defaults refer to the sample frame built
    by the parent class.
    '''
    def __init__(self, data=None, index_cols=None, path_to_log=None):
        '''
        :param DataFrame data: passed on to the parent class
        :param list index_cols: passed on to the parent class
        :param str path_to_log: passed on to the parent class
        '''
        super(TestKpisByAggregation, self).__init__(data=data,
                                                    index_cols=index_cols,
                                                    path_to_log=path_to_log)

    def _indexed(self, frame):
        '''Sort a frame by the index columns and move them into the index.'''
        return frame.sort_values(self.index_cols).set_index(self.index_cols)

    def _expected(self, answer, records):
        '''Return the caller's answer, or the default one built from records.

        ``answer or default`` cannot be used here: the truth value of a
        DataFrame is ambiguous and evaluating it raises a ValueError.
        '''
        if answer is not None:
            return answer
        return self._indexed(pd.DataFrame(records))

    def _check(self, kpis, answer):
        '''Run the aggregation and compare it with the expected frame.'''
        result = self._indexed(self.obj.get_kpis_by_aggregation(kpis=kpis))
        # Compare on the result's columns only, in the result's order.
        assert_frame_equal(result, answer[result.columns])

    def test_builtin_aggfuncs_numeric_cols(self, answer=None, kpis=None):
        '''Tests the expected behaviour of pandas builtin aggregation function,
        in particular behaviour with missing values

        :param DataFrame answer: expected frame, indexed by the index columns
        :param list of tuples or dict kpis: aggregations to compute
        '''
        kpis = kpis or [('int', ['min', 'std']),
                        ('float', ['mean', np.sum]),
                        ('float', 'sum'),
                        ('nan', 'mean')]

        answer = self._expected(answer, [
            {'id1': 1, 'id2': 'a', 'int_min': 1,
             'int_std': pd.Series([1, 2, 3]).std(),
             'float_mean': np.mean([0.1, 7.0]), 'float_sum': 7.1,
             'nan_mean': np.nan},
            {'id1': 3, 'id2': 'b', 'int_min': 2,
             'int_std': pd.Series([2, 3]).std(),
             'float_mean': np.mean([0.1, 99.9]), 'float_sum': 100,
             'nan_mean': np.nan},
            {'id1': 3, 'id2': 'a', 'int_min': 55,
             'int_std': np.nan,
             'float_mean': 99.9, 'float_sum': 99.9,
             'nan_mean': np.nan},
        ])

        self._check(kpis, answer)

    def test_dict_kpi(self, kpis=None, answer=None):
        '''Tests a kpis specification given as a dictionary
        column -> aggregation function(s)

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected frame, indexed by the index columns
        '''
        kpis = kpis or {'int': ['min', 'std'], 'float': 'mean'}

        answer = self._expected(answer, [
            {'id1': 1, 'id2': 'a', 'int_min': 1,
             'int_std': pd.Series([1, 2, 3]).std(),
             'float_mean': np.mean([0.1, 7.0])},
            {'id1': 3, 'id2': 'b', 'int_min': 2,
             'int_std': pd.Series([2, 3]).std(),
             'float_mean': np.mean([0.1, 99.9])},
            {'id1': 3, 'id2': 'a', 'int_min': 55,
             'int_std': np.nan,
             'float_mean': 99.9},
        ])

        self._check(kpis, answer)

    def test_string_cols(self, kpis=None, answer=None):
        '''Tests summing up a string column; the missing value of group
        (1, 'a') is expected to appear as the text 'nan' in the result

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected frame, indexed by the index columns
        '''
        kpis = kpis or {'str': ['sum']}

        answer = self._expected(answer, [
            {'id1': 1, 'id2': 'a', 'str_sum': 'anan'},
            {'id1': 3, 'id2': 'b', 'str_sum': 'ca'},
            {'id1': 3, 'id2': 'a', 'str_sum': 'a'},
        ])

        self._check(kpis, answer)

    def test_custom_aggfunc(self, kpis=None, answer=None):
        '''Tests a user defined aggregation function; the result column is
        expected to be named after the function

        :param dict kpis: aggregations to compute, defaults to a custom
            sum of column 'int'
        :param DataFrame answer: expected frame, indexed by the index columns
        '''
        if kpis is None:
            def custom_sum(x):
                return np.sum(x)

            kpis = {'int': custom_sum}

        # Group (3, 'b') holds the values 3 and 2, group (3, 'a') only 55.
        answer = self._expected(answer, [
            {'id1': 1, 'id2': 'a', 'int_custom_sum': 6},
            {'id1': 3, 'id2': 'b', 'int_custom_sum': 5},
            {'id1': 3, 'id2': 'a', 'int_custom_sum': 55},
        ])

        self._check(kpis, answer)

    def test_some_wrong_col(self, kpis=None, answer=None):
        '''Tests a specification in which one of the columns ('bla') is not
        part of the data; only the existing column is expected in the result

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected frame, indexed by the index columns
        '''
        kpis = kpis or {'bla': 'sum', 'int': 'sum'}

        answer = self._expected(answer, [
            {'id1': 1, 'id2': 'a', 'int_sum': 6},
            {'id1': 3, 'id2': 'a', 'int_sum': 55},
            {'id1': 3, 'id2': 'b', 'int_sum': 5},
        ])

        self._check(kpis, answer)

    def test_all_wrong_cols(self, kpis=None, answer=None):
        '''Tests a specification in which none of the columns is part of
        the data; the distinct index combinations are expected back

        :param dict kpis: aggregations to compute
        :param DataFrame answer: expected frame with a default integer
            index, defaults to the distinct rows of the index columns
        '''
        kpis = kpis or {'bla': 'sum', 'blub': 'sum'}

        result = self.obj.get_kpis_by_aggregation(kpis=kpis)

        if answer is None:
            answer = (self.data[self.index_cols]
                      .drop_duplicates()
                      .reset_index(drop=True))

        assert_frame_equal(result, answer[result.columns])
-
if __name__ == '__main__':
    # Manual run of the aggregation tests, logging to a file below the
    # project directory named by the environment variable PROJECT_DIR.
    project_dir = os.environ.get('PROJECT_DIR')

    if project_dir is None:
        # os.path.join(None, ...) would only fail with an opaque TypeError.
        raise EnvironmentError('The environment variable PROJECT_DIR is not set')

    path_to_log = os.path.join(project_dir,
                               'tests', 'test_feature_engineering',
                               'test_in_memory_feature_engineering',
                               'test_kpis_by_aggregation.log')

    configure_logging(path_to_log)
    logger = logging.getLogger(__name__)

    inst = TestKpisByAggregation(path_to_log=path_to_log)

    # Same checks, in the same order, as before.
    for run_test in (inst.test_builtin_aggfuncs_numeric_cols,
                     inst.test_dict_kpi,
                     inst.test_string_cols,
                     inst.test_some_wrong_col,
                     inst.test_all_wrong_cols):
        run_test()

    logger.info('Done testing method get_kpis_by_aggregation!')
|