FlattenData.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Oct 9 15:17:34 2019
  5. @author: oskar
  6. @description: Class which flattens nested Dataframes, Dictionaries and Lists into tabular form
  7. """
  8. import sys
  9. import os
  10. import time
  11. import pandas as pd
  12. import copy
  13. sys.path.append(os.getcwd())
  14. from cdplib.log import Log
  15. class FlattenData():
  16. def __init__(self):
  17. self._log = Log("Flatten data")
  18. def flatten(self, data, labels_to_ignore: list = []) -> pd.DataFrame():
  19. '''
  20. :parm data: data given in either dictionary, list or dataframe format.
  21. '''
  22. assert(isinstance(data, (list, dict, pd.DataFrame, pd.Series))),\
  23. "Parameter 'data' either be of List, Dictionary or DataFrame type"
  24. in_length=0
  25. start = time.time()
  26. index_name=None
  27. if type(data) is pd.DataFrame:
  28. in_length = len(data.columns)
  29. index_name = data.index.name
  30. return_data = self.flatten_dataframe(data, labels_to_ignore=labels_to_ignore)
  31. elif type(data) is pd.Series:
  32. data = pd.DataFrame(data)
  33. in_length = len(data.columns)
  34. return_data = self.flatten_dataframe(data, labels_to_ignore=labels_to_ignore)
  35. elif type(data) is dict:
  36. in_length = len(data)
  37. return_data = self.flatten_dict(data, labels_to_ignore=labels_to_ignore)
  38. elif type(data) is list:
  39. in_length = len(data)
  40. return_data = self.flatten_list(data, labels_to_ignore=labels_to_ignore)
  41. else:
  42. self._log.log_and_raise_warning(("Input data type '{}' is not supported").format(type(data)))
  43. return None
  44. result_dataframe = pd.DataFrame.from_dict(return_data, orient='index')
  45. if index_name is not None:
  46. result_dataframe.index.name = index_name
  47. self._log.info(('Data has been flattened, created {} columns in {} seconds').format(len(result_dataframe.columns)- in_length, time.time()-start))
  48. return result_dataframe
  49. def flatten_dataframe(self, dataframe: pd.DataFrame, incoming_key: str = None, labels_to_ignore: list = []):
  50. '''
  51. :param pd.Dataframe dataframe: dataframe containing the data to be flattened
  52. :param str incoming_key: string to be appended to the key
  53. '''
  54. assert(isinstance(dataframe, pd.DataFrame)),\
  55. "Parameter 'dataframe' be of DataFrame type"
  56. if incoming_key is not None and not isinstance(incoming_key, str):
  57. incoming_key = str(incoming_key)
  58. result_dict = {}
  59. for index, row in dataframe.iterrows():
  60. temp_result_dict = {}
  61. for key, value in row.iteritems():
  62. if not isinstance(key, str):
  63. key = str(key)
  64. small_key = key
  65. if incoming_key is not None:
  66. key = incoming_key + '_' + key
  67. temp_result = {}
  68. if small_key in labels_to_ignore:
  69. temp_result_dict[key] = value
  70. else:
  71. if type(value) == list:
  72. temp_result = self.flatten_list(value, key, labels_to_ignore)
  73. elif type(value) == dict:
  74. temp_result = self.flatten_dict(value, key, labels_to_ignore)
  75. else:
  76. temp_result_dict[key] = value
  77. if len(temp_result) > 0:
  78. temp_result_dict = self.append_to_dict(temp_result_dict, temp_result)
  79. result_dict[index] = copy.deepcopy(temp_result_dict)
  80. return result_dict
  81. def flatten_dict(self, dictionary: dict, incoming_key: str = None, labels_to_ignore: list = []):
  82. '''
  83. :param dict dictionary: dictionary containing the data to be flattened
  84. :param str incoming_key: string to be appended to the key
  85. '''
  86. assert(isinstance(dictionary, dict)),\
  87. "Parameter 'dictionary' be of Dictionary type"
  88. if incoming_key is not None and not isinstance(incoming_key, str):
  89. incoming_key = str(incoming_key)
  90. result_dict = {}
  91. for key in dictionary:
  92. temp_data = dictionary[key]
  93. if not isinstance(key, str):
  94. key = str(key)
  95. small_key = key
  96. if incoming_key is not None:
  97. key = incoming_key + '_' + key
  98. temp_result = {}
  99. if small_key in labels_to_ignore:
  100. result_dict[key] = temp_data
  101. else:
  102. if type(temp_data) == list:
  103. temp_result = self.flatten_list(temp_data, key, labels_to_ignore)
  104. elif type(temp_data) == dict:
  105. temp_result = self.flatten_dict(temp_data, key, labels_to_ignore)
  106. else:
  107. result_dict[key] = temp_data
  108. if len(temp_result) > 0:
  109. result_dict = self.append_to_dict(result_dict, temp_result)
  110. return result_dict
  111. def flatten_list(self, data_list: list, incoming_key: str = None, labels_to_ignore: list = []):
  112. '''
  113. :param list data_list: list containing the data to be flattened
  114. :param str incoming_key: string to be appended to the key
  115. '''
  116. assert(isinstance(data_list, list)),\
  117. "Parameter 'data_list' be of List type"
  118. if incoming_key is not None and not isinstance(incoming_key, str):
  119. incoming_key = str(incoming_key)
  120. result_dict = {}
  121. for iteration, item in enumerate(data_list):
  122. temp_dataframe = item
  123. temp_result = {}
  124. key = incoming_key
  125. if not isinstance(key, str):
  126. key = str(key)
  127. if incoming_key is not None:
  128. list_iterator = self.add_list_iterator(data_list, iteration)
  129. key = incoming_key + '_' + list_iterator
  130. else:
  131. key = str(iteration)
  132. if type(temp_dataframe) == list:
  133. temp_result = self.flatten_list(temp_dataframe, key, labels_to_ignore)
  134. elif type(temp_dataframe) == dict:
  135. temp_result = self.flatten_dict(temp_dataframe, key, labels_to_ignore)
  136. else:
  137. result_dict[key] = temp_dataframe
  138. if len(temp_result) > 0:
  139. result_dict = self.append_to_dict(result_dict, temp_result)
  140. return result_dict
  141. def append_to_dict(self, dictionary: dict, to_append):
  142. '''
  143. :param dict dictionary: dictionary which holds all the resulting data.
  144. :param dict to_append: data to be added to the resulting dictionary.
  145. '''
  146. assert(isinstance(dictionary, dict)),\
  147. "Parameter 'dictionary' be of Dictionary type"
  148. assert(isinstance(to_append, dict)),\
  149. "Parameter 'to_append' be of Dictionary type"
  150. for key in to_append:
  151. dictionary[key] = to_append[key]
  152. return dictionary
  153. def flatten_if_not_flat(self, data: pd.DataFrame, labels_to_ignore: list = []):
  154. for data_type in data.dtypes:
  155. if data_type == object:
  156. return self.flatten(data, labels_to_ignore=labels_to_ignore)
  157. return data
  158. # Create class that inherits FlattenData and overwrite this function for specific implementations.
  159. def add_list_iterator(self, data_list: list, iteration: int):
  160. return str(iteration)