ソースを参照

Optimization: in _make_list_of_distinct, return the value right away if it only contains one element

ogert 5 年 前
コミット
e8c4acf7c1
共有 1 個のファイルを変更し、39 個の追加と 25 個の削除を含む
  1. 39 25
      cdplib/db_migration/DataFrameToCollection.py

+ 39 - 25
cdplib/db_migration/DataFrameToCollection.py

@@ -10,8 +10,10 @@ Created on Mon Jul 22 11:05:47 2019
 """
 """
 
 
 import pandas as pd
 import pandas as pd
+import numpy as np
 import os
 import os
 import sys
 import sys
+import time
 
 
 sys.path.append(os.getcwd())
 sys.path.append(os.getcwd())
 
 
@@ -67,10 +69,11 @@ class DataFrameToCollection():
          grp_fields and reshape it accordingly, the result is a pandas Series.
          grp_fields and reshape it accordingly, the result is a pandas Series.
          In the end all the series are collected and concatenated.
          In the end all the series are collected and concatenated.
         '''
         '''
-        from copy import deepcopy
 
 
+        from copy import deepcopy
+        
         data = self._melt_duplicated_columns(data)
         data = self._melt_duplicated_columns(data)
-
+        
         reshaped_fields = []
         reshaped_fields = []
 
 
         if schema is None:
         if schema is None:
@@ -87,10 +90,10 @@ class DataFrameToCollection():
             if field_type not in ["array", "object"]:
             if field_type not in ["array", "object"]:
 
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
                 grp_fields = [c for c in grp_fields if c in data.columns]
-
+                
                 # check that there is only one possible value of this field
                 # check that there is only one possible value of this field
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
-
+                
                 if n_distinct_values != 1:
                 if n_distinct_values != 1:
                     err = "Field {0} is not unique with respect to {1}"\
                     err = "Field {0} is not unique with respect to {1}"\
                           .format(field, grp_fields)
                           .format(field, grp_fields)
@@ -111,7 +114,7 @@ class DataFrameToCollection():
             elif field_type == "object":
             elif field_type == "object":
 
 
                 sub_schema = deepcopy(schema["properties"][field])
                 sub_schema = deepcopy(schema["properties"][field])
-            
+                
                 # rename sub-schema properties to match with data column names
                 # rename sub-schema properties to match with data column names
                 sub_schema["properties"] =\
                 sub_schema["properties"] =\
                     {".".join([field, k]): v for k, v
                     {".".join([field, k]): v for k, v
@@ -125,6 +128,7 @@ class DataFrameToCollection():
                 
                 
                 # Need to be checked since child elements can be empty
                 # Need to be checked since child elements can be empty
                 if sub_data is not None:
                 if sub_data is not None:
+                    
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field.name = field
                     reshaped_field.name = field
     
     
@@ -133,23 +137,23 @@ class DataFrameToCollection():
             # if field is a list of dictionaries
             # if field is a list of dictionaries
             elif field_type == "array":
             elif field_type == "array":
              
              
-
+               
                 items_type = schema["properties"][field]["items"]["bsonType"]
                 items_type = schema["properties"][field]["items"]["bsonType"]
 
 
                 if items_type == "object":
                 if items_type == "object":
-
+                    array_object = time.time()
                     sub_schema = deepcopy(schema["properties"][field]["items"])
                     sub_schema = deepcopy(schema["properties"][field]["items"])
 
 
                     # rename sub-schema properties to match data column names
                     # rename sub-schema properties to match data column names
                     sub_schema["properties"] =\
                     sub_schema["properties"] =\
                         {".".join([field, k]): v for k, v in
                         {".".join([field, k]): v for k, v in
                          sub_schema["properties"].items()}
                          sub_schema["properties"].items()}
-
+                    
                     # extend grp fields by sub-fields of field simple types
                     # extend grp fields by sub-fields of field simple types
                     sub_grp_fields = [f for f in sub_schema["properties"]
                     sub_grp_fields = [f for f in sub_schema["properties"]
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       and (f in data.columns)]
                                       and (f in data.columns)]
-
+                    
                     if len(sub_grp_fields) == 0:
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
                                " must be of simple type for the field {}"
@@ -157,31 +161,32 @@ class DataFrameToCollection():
 
 
                         self._log.error(err)
                         self._log.error(err)
                         raise Exception(err)
                         raise Exception(err)
-                        
+                    
                     # group and reshape sub-fields with complex types
                     # group and reshape sub-fields with complex types
                     sub_data = self.to_list_of_documents(
                     sub_data = self.to_list_of_documents(
                                 data=data,
                                 data=data,
                                 schema=sub_schema,
                                 schema=sub_schema,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 _final_step=False)
                                 _final_step=False)
-
+                    
                     if sub_data is not None:
                     if sub_data is not None:
-
+                        
                         # gether the results into a list of dictionaries
                         # gether the results into a list of dictionaries
                         sub_data = sub_data.apply(self._make_dict, axis=1)
                         sub_data = sub_data.apply(self._make_dict, axis=1)
-
+                        
                         sub_data.name = field
                         sub_data.name = field
                         sub_data = sub_data.reset_index(grp_fields)
                         sub_data = sub_data.reset_index(grp_fields)
-
+                        ######################################################
+                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
                         reshaped_field =\
                         reshaped_field =\
                             sub_data.groupby(grp_fields, sort=False)[field]\
                             sub_data.groupby(grp_fields, sort=False)[field]\
                                     .apply(self._make_list_of_distinct)
                                     .apply(self._make_list_of_distinct)
-
+                        ######################################################
                         reshaped_fields.append(reshaped_field)
                         reshaped_fields.append(reshaped_field)
-
+                        
+                    
                 # if field is a list of values with simple type
                 # if field is a list of values with simple type
                 elif items_type == "array":
                 elif items_type == "array":
-
                     grp_fields = [c for c in grp_fields if c in data.columns]
                     grp_fields = [c for c in grp_fields if c in data.columns]
 
 
                     if field in data.columns:
                     if field in data.columns:
@@ -190,7 +195,6 @@ class DataFrameToCollection():
                                              .apply(self._make_list_of_distinct)
                                              .apply(self._make_list_of_distinct)
 
 
                         reshaped_fields.append(reshaped_field)
                         reshaped_fields.append(reshaped_field)
-
                 else:
                 else:
 
 
                     grp_fields = [c for c in grp_fields if c in data.columns]
                     grp_fields = [c for c in grp_fields if c in data.columns]
@@ -203,9 +207,9 @@ class DataFrameToCollection():
                         reshaped_fields.append(reshaped_field)
                         reshaped_fields.append(reshaped_field)
 
 
         if len(reshaped_fields) > 0:
         if len(reshaped_fields) > 0:
-
+            
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
-
+            
             if _final_step:
             if _final_step:
                 # dropping the index names if it is the final step,
                 # dropping the index names if it is the final step,
                 # if not the index is needed for merging
                 # if not the index is needed for merging
@@ -214,7 +218,7 @@ class DataFrameToCollection():
                                    .reset_index(drop=False)
                                    .reset_index(drop=False)
 
 
                 self._log.info("Done reshaping the dataframe to a list of documents")
                 self._log.info("Done reshaping the dataframe to a list of documents")
-
+                
             return reshaped_fields
             return reshaped_fields
 
 
         else:
         else:
@@ -267,18 +271,28 @@ class DataFrameToCollection():
          entries are arbitrary objects
          entries are arbitrary objects
          (pandas unique() method does not work if entries are of complex types)
          (pandas unique() method does not work if entries are of complex types)
         '''
         '''
-        uniques = pd.DataFrame({"temp": x.tolist()})\
-                    .assign(temp_str=lambda y: y["temp"].astype(str))\
+            
+    
+        if x.size == 1:
+            return_value = x.tolist()
+            if return_value == [{}]:
+                return []
+            return return_value
+        
+
+        uniques = pd.DataFrame({"temp": x.values})\
+                    .assign(temp_str=lambda y: y["temp"].astype(np.str))\
                     .drop_duplicates(subset=["temp_str"])\
                     .drop_duplicates(subset=["temp_str"])\
                     .drop("temp_str", axis=1).iloc[:, 0].tolist()
                     .drop("temp_str", axis=1).iloc[:, 0].tolist()
-
+        
+    
         def is_empty(y):
         def is_empty(y):
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             return is_empty_dict or is_empty_list
             return is_empty_dict or is_empty_list
 
 
         return [el for el in uniques if not is_empty(el)]
         return [el for el in uniques if not is_empty(el)]
-
+        
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
         '''
         '''
         return: list of unique values from a Series where
         return: list of unique values from a Series where