|
@@ -10,8 +10,10 @@ Created on Mon Jul 22 11:05:47 2019
|
|
|
"""
|
|
|
|
|
|
import pandas as pd
|
|
|
+import numpy as np
|
|
|
import os
|
|
|
import sys
|
|
|
+import time
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
@@ -67,6 +69,7 @@ class DataFrameToCollection():
|
|
|
grp_fields and reshape it accordingly, the result is a pandas Series.
|
|
|
In the end all the series are collected and concatenated.
|
|
|
'''
|
|
|
+
|
|
|
from copy import deepcopy
|
|
|
|
|
|
data = self._melt_duplicated_columns(data)
|
|
@@ -77,7 +80,7 @@ class DataFrameToCollection():
|
|
|
schema = self.schema
|
|
|
|
|
|
for field in schema["properties"]:
|
|
|
-
|
|
|
+
|
|
|
if field not in self._unroll_nested_names(data.columns):
|
|
|
continue
|
|
|
|
|
@@ -90,8 +93,8 @@ class DataFrameToCollection():
|
|
|
|
|
|
# check that there is only one possible value of this field
|
|
|
n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
|
|
|
-
|
|
|
- #n_distinct_valus can be 0 if the column only contains NaN values
|
|
|
+
|
|
|
+ # n_distinct_values can be 0 if the column only contains NaN values
|
|
|
if n_distinct_values > 1:
|
|
|
err = "Field {0} is not unique with respect to {1}"\
|
|
|
.format(field, grp_fields)
|
|
@@ -112,33 +115,34 @@ class DataFrameToCollection():
|
|
|
elif field_type == "object":
|
|
|
|
|
|
sub_schema = deepcopy(schema["properties"][field])
|
|
|
-
|
|
|
+
|
|
|
# rename sub-schema properties to match with data column names
|
|
|
sub_schema["properties"] =\
|
|
|
{".".join([field, k]): v for k, v
|
|
|
in sub_schema["properties"].items()}
|
|
|
-
|
|
|
+
|
|
|
sub_data = self.to_list_of_documents(
|
|
|
data=data,
|
|
|
schema=sub_schema,
|
|
|
grp_fields=grp_fields,
|
|
|
_final_step=False)
|
|
|
-
|
|
|
+
|
|
|
# Need to be checked since child elements can be empty
|
|
|
if sub_data is not None:
|
|
|
+
|
|
|
reshaped_field = sub_data.apply(self._make_dict, axis=1)
|
|
|
reshaped_field.name = field
|
|
|
-
|
|
|
+
|
|
|
reshaped_fields.append(reshaped_field)
|
|
|
|
|
|
# if field is a list of dictionaries
|
|
|
elif field_type == "array":
|
|
|
-
|
|
|
+
|
|
|
|
|
|
items_type = schema["properties"][field]["items"]["bsonType"]
|
|
|
|
|
|
if items_type == "object":
|
|
|
-
|
|
|
+ array_object = time.time()
|
|
|
sub_schema = deepcopy(schema["properties"][field]["items"])
|
|
|
|
|
|
# rename sub-schema properties to match data column names
|
|
@@ -158,7 +162,7 @@ class DataFrameToCollection():
|
|
|
|
|
|
self._log.error(err)
|
|
|
raise Exception(err)
|
|
|
-
|
|
|
+
|
|
|
# group and reshape sub-fields with complex types
|
|
|
sub_data = self.to_list_of_documents(
|
|
|
data=data,
|
|
@@ -173,16 +177,17 @@ class DataFrameToCollection():
|
|
|
|
|
|
sub_data.name = field
|
|
|
sub_data = sub_data.reset_index(grp_fields)
|
|
|
-
|
|
|
+ ######################################################
|
|
|
+ ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
|
|
|
reshaped_field =\
|
|
|
sub_data.groupby(grp_fields, sort=False)[field]\
|
|
|
.apply(self._make_list_of_distinct)
|
|
|
-
|
|
|
+ ######################################################
|
|
|
reshaped_fields.append(reshaped_field)
|
|
|
|
|
|
+
|
|
|
# if field is a list of values with simple type
|
|
|
elif items_type == "array":
|
|
|
-
|
|
|
grp_fields = [c for c in grp_fields if c in data.columns]
|
|
|
|
|
|
if field in data.columns:
|
|
@@ -191,7 +196,6 @@ class DataFrameToCollection():
|
|
|
.apply(self._make_list_of_distinct)
|
|
|
|
|
|
reshaped_fields.append(reshaped_field)
|
|
|
-
|
|
|
else:
|
|
|
|
|
|
grp_fields = [c for c in grp_fields if c in data.columns]
|
|
@@ -268,10 +272,22 @@ class DataFrameToCollection():
|
|
|
entries are arbitrary objects
|
|
|
(pandas unique() method does not work if entries are of complex types)
|
|
|
'''
|
|
|
- uniques = pd.DataFrame({"temp": x.tolist()})\
|
|
|
- .assign(temp_str=lambda y: y["temp"].astype(str))\
|
|
|
- .drop_duplicates(subset=["temp_str"])\
|
|
|
- .drop("temp_str", axis=1).iloc[:, 0].tolist()
|
|
|
+
|
|
|
+
|
|
|
+ if x.size == 1:
|
|
|
+ uniques = x.tolist()
|
|
|
+ '''
|
|
|
+ if return_value == [{}]:
|
|
|
+ return []
|
|
|
+ return return_value
|
|
|
+ '''
|
|
|
+ else:
|
|
|
+
|
|
|
+ uniques = pd.DataFrame({"temp": x.values})\
|
|
|
+ .assign(temp_str=lambda y: y["temp"].astype(np.str))\
|
|
|
+ .drop_duplicates(subset=["temp_str"])\
|
|
|
+ .drop("temp_str", axis=1).iloc[:, 0].tolist()
|
|
|
+
|
|
|
|
|
|
def is_empty(y):
|
|
|
is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
|
|
@@ -397,11 +413,3 @@ if __name__ == "__main__":
|
|
|
data=df,
|
|
|
schema=schm,
|
|
|
grp_fields=grp_fields)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|