Przeglądaj źródła

Optimization: in _make_list_of_distinct, return the value right away if it only contains one element

ogert 5 lat temu
rodzic
commit
e8c4acf7c1
1 zmienionych plików z 39 dodań i 25 usunięć
  1. 39 25
      cdplib/db_migration/DataFrameToCollection.py

+ 39 - 25
cdplib/db_migration/DataFrameToCollection.py

@@ -10,8 +10,10 @@ Created on Mon Jul 22 11:05:47 2019
 """
 
 import pandas as pd
+import numpy as np
 import os
 import sys
+import time
 
 sys.path.append(os.getcwd())
 
@@ -67,10 +69,11 @@ class DataFrameToCollection():
          grp_fields and reshape it accordingly, the result is a pandas Series.
          In the end all the series are collected and concatenated.
         '''
-        from copy import deepcopy
 
+        from copy import deepcopy
+        
         data = self._melt_duplicated_columns(data)
-
+        
         reshaped_fields = []
 
         if schema is None:
@@ -87,10 +90,10 @@ class DataFrameToCollection():
             if field_type not in ["array", "object"]:
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
-
+                
                 # check that there is only one possible value of this field
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
-
+                
                 if n_distinct_values != 1:
                     err = "Field {0} is not unique with respect to {1}"\
                           .format(field, grp_fields)
@@ -111,7 +114,7 @@ class DataFrameToCollection():
             elif field_type == "object":
 
                 sub_schema = deepcopy(schema["properties"][field])
-            
+                
                 # rename sub-schema properties to match with data column names
                 sub_schema["properties"] =\
                     {".".join([field, k]): v for k, v
@@ -125,6 +128,7 @@ class DataFrameToCollection():
                 
                 # Need to be checked since child elements can be empty
                 if sub_data is not None:
+                    
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field.name = field
     
@@ -133,23 +137,23 @@ class DataFrameToCollection():
             # if field is a list of dictionaries
             elif field_type == "array":
              
-
+               
                 items_type = schema["properties"][field]["items"]["bsonType"]
 
                 if items_type == "object":
-
+                    array_object = time.time()
                     sub_schema = deepcopy(schema["properties"][field]["items"])
 
                     # rename sub-schema properties to match data column names
                     sub_schema["properties"] =\
                         {".".join([field, k]): v for k, v in
                          sub_schema["properties"].items()}
-
+                    
                     # extend grp fields by sub-fields of field simple types
                     sub_grp_fields = [f for f in sub_schema["properties"]
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       and (f in data.columns)]
-
+                    
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
@@ -157,31 +161,32 @@ class DataFrameToCollection():
 
                         self._log.error(err)
                         raise Exception(err)
-                        
+                    
                     # group and reshape sub-fields with complex types
                     sub_data = self.to_list_of_documents(
                                 data=data,
                                 schema=sub_schema,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 _final_step=False)
-
+                    
                     if sub_data is not None:
-
+                        
                         # gather the results into a list of dictionaries
                         sub_data = sub_data.apply(self._make_dict, axis=1)
-
+                        
                         sub_data.name = field
                         sub_data = sub_data.reset_index(grp_fields)
-
+                        ######################################################
+                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
                         reshaped_field =\
                             sub_data.groupby(grp_fields, sort=False)[field]\
                                     .apply(self._make_list_of_distinct)
-
+                        ######################################################
                         reshaped_fields.append(reshaped_field)
-
+                        
+                    
                 # if field is a list of values with simple type
                 elif items_type == "array":
-
                     grp_fields = [c for c in grp_fields if c in data.columns]
 
                     if field in data.columns:
@@ -190,7 +195,6 @@ class DataFrameToCollection():
                                              .apply(self._make_list_of_distinct)
 
                         reshaped_fields.append(reshaped_field)
-
                 else:
 
                     grp_fields = [c for c in grp_fields if c in data.columns]
@@ -203,9 +207,9 @@ class DataFrameToCollection():
                         reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
-
+            
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
-
+            
             if _final_step:
                 # dropping the index names if it is the final step,
                 # if not the index is needed for merging
@@ -214,7 +218,7 @@ class DataFrameToCollection():
                                    .reset_index(drop=False)
 
                 self._log.info("Done reshaping the dataframe to a list of documents")
-
+                
             return reshaped_fields
 
         else:
@@ -267,18 +271,28 @@ class DataFrameToCollection():
          entries are arbitrary objects
          (pandas unique() method does not work if entries are of complex types)
         '''
-        uniques = pd.DataFrame({"temp": x.tolist()})\
-                    .assign(temp_str=lambda y: y["temp"].astype(str))\
+            
+    
+        if x.size == 1:
+            return_value = x.tolist()
+            if return_value == [{}]:
+                return []
+            return return_value
+        
+
+        uniques = pd.DataFrame({"temp": x.values})\
+                    .assign(temp_str=lambda y: y["temp"].astype(np.str))\
                     .drop_duplicates(subset=["temp_str"])\
                     .drop("temp_str", axis=1).iloc[:, 0].tolist()
-
+        
+    
         def is_empty(y):
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             return is_empty_dict or is_empty_list
 
         return [el for el in uniques if not is_empty(el)]
-
+        
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
         '''
         return: list of unique values from a Series where