ソースを参照

Optimization: in _make_list_of_distinct, return the value right away if it only contains one element

ogert 5 年 前
コミット
e8c4acf7c1
共有 1 個のファイルを変更し、39 個の追加と 25 個の削除を含む
  1. 39 25
      cdplib/db_migration/DataFrameToCollection.py

+ 39 - 25
cdplib/db_migration/DataFrameToCollection.py

@@ -10,8 +10,10 @@ Created on Mon Jul 22 11:05:47 2019
 """
 """
 
 
 import pandas as pd
 import pandas as pd
+import numpy as np
 import os
 import os
 import sys
 import sys
+import time
 
 
 sys.path.append(os.getcwd())
 sys.path.append(os.getcwd())
 
 
@@ -67,10 +69,11 @@ class DataFrameToCollection():
          grp_fields and reshape it accordingly, the result is a pandas Series.
          grp_fields and reshape it accordingly, the result is a pandas Series.
          In the end all the series are collected and concatenated.
          In the end all the series are collected and concatenated.
         '''
         '''
-        from copy import deepcopy
 
 
+        from copy import deepcopy
+        
         data = self._melt_duplicated_columns(data)
         data = self._melt_duplicated_columns(data)
-
+        
         reshaped_fields = []
         reshaped_fields = []
 
 
         if schema is None:
         if schema is None:
@@ -87,10 +90,10 @@ class DataFrameToCollection():
             if field_type not in ["array", "object"]:
             if field_type not in ["array", "object"]:
 
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
                 grp_fields = [c for c in grp_fields if c in data.columns]
-
+                
                 # check that there is only one possible value of this field
                 # check that there is only one possible value of this field
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
-
+                
                 if n_distinct_values != 1:
                 if n_distinct_values != 1:
                     err = "Field {0} is not unique with respect to {1}"\
                     err = "Field {0} is not unique with respect to {1}"\
                           .format(field, grp_fields)
                           .format(field, grp_fields)
@@ -111,7 +114,7 @@ class DataFrameToCollection():
             elif field_type == "object":
             elif field_type == "object":
 
 
                 sub_schema = deepcopy(schema["properties"][field])
                 sub_schema = deepcopy(schema["properties"][field])
-            
+                
                 # rename sub-schema properties to match with data column names
                 # rename sub-schema properties to match with data column names
                 sub_schema["properties"] =\
                 sub_schema["properties"] =\
                     {".".join([field, k]): v for k, v
                     {".".join([field, k]): v for k, v
@@ -125,6 +128,7 @@ class DataFrameToCollection():
                 
                 
                 # Need to be checked since child elements can be empty
                 # Need to be checked since child elements can be empty
                 if sub_data is not None:
                 if sub_data is not None:
+                    
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field.name = field
                     reshaped_field.name = field
     
     
@@ -133,23 +137,23 @@ class DataFrameToCollection():
             # if field is a list of dictionaries
             # if field is a list of dictionaries
             elif field_type == "array":
             elif field_type == "array":
              
              
-
+               
                 items_type = schema["properties"][field]["items"]["bsonType"]
                 items_type = schema["properties"][field]["items"]["bsonType"]
 
 
                 if items_type == "object":
                 if items_type == "object":
-
+                    array_object = time.time()
                     sub_schema = deepcopy(schema["properties"][field]["items"])
                     sub_schema = deepcopy(schema["properties"][field]["items"])
 
 
                     # rename sub-schema properties to match data column names
                     # rename sub-schema properties to match data column names
                     sub_schema["properties"] =\
                     sub_schema["properties"] =\
                         {".".join([field, k]): v for k, v in
                         {".".join([field, k]): v for k, v in
                          sub_schema["properties"].items()}
                          sub_schema["properties"].items()}
-
+                    
                     # extend grp fields by sub-fields of field simple types
                     # extend grp fields by sub-fields of field simple types
                     sub_grp_fields = [f for f in sub_schema["properties"]
                     sub_grp_fields = [f for f in sub_schema["properties"]
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       and (f in data.columns)]
                                       and (f in data.columns)]
-
+                    
                     if len(sub_grp_fields) == 0:
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
                                " must be of simple type for the field {}"
@@ -157,31 +161,32 @@ class DataFrameToCollection():
 
 
                         self._log.error(err)
                         self._log.error(err)
                         raise Exception(err)
                         raise Exception(err)
-                        
+                    
                     # group and reshape sub-fields with complex types
                     # group and reshape sub-fields with complex types
                     sub_data = self.to_list_of_documents(
                     sub_data = self.to_list_of_documents(
                                 data=data,
                                 data=data,
                                 schema=sub_schema,
                                 schema=sub_schema,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 _final_step=False)
                                 _final_step=False)
-
+                    
                     if sub_data is not None:
                     if sub_data is not None:
-
+                        
                         # gether the results into a list of dictionaries
                         # gether the results into a list of dictionaries
                         sub_data = sub_data.apply(self._make_dict, axis=1)
                         sub_data = sub_data.apply(self._make_dict, axis=1)
-
+                        
                         sub_data.name = field
                         sub_data.name = field
                         sub_data = sub_data.reset_index(grp_fields)
                         sub_data = sub_data.reset_index(grp_fields)
-
+                        ######################################################
+                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
                         reshaped_field =\
                         reshaped_field =\
                             sub_data.groupby(grp_fields, sort=False)[field]\
                             sub_data.groupby(grp_fields, sort=False)[field]\
                                     .apply(self._make_list_of_distinct)
                                     .apply(self._make_list_of_distinct)
-
+                        ######################################################
                         reshaped_fields.append(reshaped_field)
                         reshaped_fields.append(reshaped_field)
-
+                        
+                    
                 # if field is a list of values with simple type
                 # if field is a list of values with simple type
                 elif items_type == "array":
                 elif items_type == "array":
-
                     grp_fields = [c for c in grp_fields if c in data.columns]
                     grp_fields = [c for c in grp_fields if c in data.columns]
 
 
                     if field in data.columns:
                     if field in data.columns:
@@ -190,7 +195,6 @@ class DataFrameToCollection():
                                              .apply(self._make_list_of_distinct)
                                              .apply(self._make_list_of_distinct)
 
 
                         reshaped_fields.append(reshaped_field)
                         reshaped_fields.append(reshaped_field)
-
                 else:
                 else:
 
 
                     grp_fields = [c for c in grp_fields if c in data.columns]
                     grp_fields = [c for c in grp_fields if c in data.columns]
@@ -203,9 +207,9 @@ class DataFrameToCollection():
                         reshaped_fields.append(reshaped_field)
                         reshaped_fields.append(reshaped_field)
 
 
         if len(reshaped_fields) > 0:
         if len(reshaped_fields) > 0:
-
+            
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
-
+            
             if _final_step:
             if _final_step:
                 # dropping the index names if it is the final step,
                 # dropping the index names if it is the final step,
                 # if not the index is needed for merging
                 # if not the index is needed for merging
@@ -214,7 +218,7 @@ class DataFrameToCollection():
                                    .reset_index(drop=False)
                                    .reset_index(drop=False)
 
 
                 self._log.info("Done reshaping the dataframe to a list of documents")
                 self._log.info("Done reshaping the dataframe to a list of documents")
-
+                
             return reshaped_fields
             return reshaped_fields
 
 
         else:
         else:
@@ -267,18 +271,28 @@ class DataFrameToCollection():
          entries are arbitrary objects
          entries are arbitrary objects
          (pandas unique() method does not work if entries are of complex types)
          (pandas unique() method does not work if entries are of complex types)
         '''
         '''
-        uniques = pd.DataFrame({"temp": x.tolist()})\
-                    .assign(temp_str=lambda y: y["temp"].astype(str))\
+            
+    
+        if x.size == 1:
+            return_value = x.tolist()
+            if return_value == [{}]:
+                return []
+            return return_value
+        
+
+        uniques = pd.DataFrame({"temp": x.values})\
+                    .assign(temp_str=lambda y: y["temp"].astype(np.str))\
                     .drop_duplicates(subset=["temp_str"])\
                     .drop_duplicates(subset=["temp_str"])\
                     .drop("temp_str", axis=1).iloc[:, 0].tolist()
                     .drop("temp_str", axis=1).iloc[:, 0].tolist()
-
+        
+    
         def is_empty(y):
         def is_empty(y):
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             return is_empty_dict or is_empty_list
             return is_empty_dict or is_empty_list
 
 
         return [el for el in uniques if not is_empty(el)]
         return [el for el in uniques if not is_empty(el)]
-
+        
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
         '''
         '''
         return: list of unique values from a Series where
         return: list of unique values from a Series where