|
@@ -10,8 +10,10 @@ Created on Mon Jul 22 11:05:47 2019
|
|
|
"""
|
|
|
|
|
|
import pandas as pd
|
|
|
+import numpy as np
|
|
|
import os
|
|
|
import sys
|
|
|
+import time
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
@@ -67,10 +69,11 @@ class DataFrameToCollection():
|
|
|
grp_fields and reshape it accordingly, the result is a pandas Series.
|
|
|
In the end all the series are collected and concatenated.
|
|
|
'''
|
|
|
- from copy import deepcopy
|
|
|
|
|
|
+ from copy import deepcopy
|
|
|
+
|
|
|
data = self._melt_duplicated_columns(data)
|
|
|
-
|
|
|
+
|
|
|
reshaped_fields = []
|
|
|
|
|
|
if schema is None:
|
|
@@ -87,10 +90,10 @@ class DataFrameToCollection():
|
|
|
if field_type not in ["array", "object"]:
|
|
|
|
|
|
grp_fields = [c for c in grp_fields if c in data.columns]
|
|
|
-
|
|
|
+
|
|
|
# check that there is only one possible value of this field
|
|
|
n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
|
|
|
-
|
|
|
+
|
|
|
if n_distinct_values != 1:
|
|
|
err = "Field {0} is not unique with respect to {1}"\
|
|
|
.format(field, grp_fields)
|
|
@@ -111,7 +114,7 @@ class DataFrameToCollection():
|
|
|
elif field_type == "object":
|
|
|
|
|
|
sub_schema = deepcopy(schema["properties"][field])
|
|
|
-
|
|
|
+
|
|
|
# rename sub-schema properties to match with data column names
|
|
|
sub_schema["properties"] =\
|
|
|
{".".join([field, k]): v for k, v
|
|
@@ -125,6 +128,7 @@ class DataFrameToCollection():
|
|
|
|
|
|
# Need to be checked since child elements can be empty
|
|
|
if sub_data is not None:
|
|
|
+
|
|
|
reshaped_field = sub_data.apply(self._make_dict, axis=1)
|
|
|
reshaped_field.name = field
|
|
|
|
|
@@ -133,23 +137,23 @@ class DataFrameToCollection():
|
|
|
# if field is a list of dictionaries
|
|
|
elif field_type == "array":
|
|
|
|
|
|
-
|
|
|
+
|
|
|
items_type = schema["properties"][field]["items"]["bsonType"]
|
|
|
|
|
|
if items_type == "object":
|
|
|
-
|
|
|
+ array_object = time.time()
|
|
|
sub_schema = deepcopy(schema["properties"][field]["items"])
|
|
|
|
|
|
# rename sub-schema properties to match data column names
|
|
|
sub_schema["properties"] =\
|
|
|
{".".join([field, k]): v for k, v in
|
|
|
sub_schema["properties"].items()}
|
|
|
-
|
|
|
+
|
|
|
# extend grp fields by sub-fields of field simple types
|
|
|
sub_grp_fields = [f for f in sub_schema["properties"]
|
|
|
if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
|
|
|
and (f in data.columns)]
|
|
|
-
|
|
|
+
|
|
|
if len(sub_grp_fields) == 0:
|
|
|
err = ("One of the sub-keys in a list of documents"
|
|
|
" must be of simple type for the field {}"
|
|
@@ -157,31 +161,32 @@ class DataFrameToCollection():
|
|
|
|
|
|
self._log.error(err)
|
|
|
raise Exception(err)
|
|
|
-
|
|
|
+
|
|
|
# group and reshape sub-fields with complex types
|
|
|
sub_data = self.to_list_of_documents(
|
|
|
data=data,
|
|
|
schema=sub_schema,
|
|
|
grp_fields=grp_fields + sub_grp_fields,
|
|
|
_final_step=False)
|
|
|
-
|
|
|
+
|
|
|
if sub_data is not None:
|
|
|
-
|
|
|
+
|
|
|
# gether the results into a list of dictionaries
|
|
|
sub_data = sub_data.apply(self._make_dict, axis=1)
|
|
|
-
|
|
|
+
|
|
|
sub_data.name = field
|
|
|
sub_data = sub_data.reset_index(grp_fields)
|
|
|
-
|
|
|
+ ######################################################
|
|
|
+ ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
|
|
|
reshaped_field =\
|
|
|
sub_data.groupby(grp_fields, sort=False)[field]\
|
|
|
.apply(self._make_list_of_distinct)
|
|
|
-
|
|
|
+ ######################################################
|
|
|
reshaped_fields.append(reshaped_field)
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
# if field is a list of values with simple type
|
|
|
elif items_type == "array":
|
|
|
-
|
|
|
grp_fields = [c for c in grp_fields if c in data.columns]
|
|
|
|
|
|
if field in data.columns:
|
|
@@ -190,7 +195,6 @@ class DataFrameToCollection():
|
|
|
.apply(self._make_list_of_distinct)
|
|
|
|
|
|
reshaped_fields.append(reshaped_field)
|
|
|
-
|
|
|
else:
|
|
|
|
|
|
grp_fields = [c for c in grp_fields if c in data.columns]
|
|
@@ -203,9 +207,9 @@ class DataFrameToCollection():
|
|
|
reshaped_fields.append(reshaped_field)
|
|
|
|
|
|
if len(reshaped_fields) > 0:
|
|
|
-
|
|
|
+
|
|
|
reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
|
|
|
-
|
|
|
+
|
|
|
if _final_step:
|
|
|
# dropping the index names if it is the final step,
|
|
|
# if not the index is needed for merging
|
|
@@ -214,7 +218,7 @@ class DataFrameToCollection():
|
|
|
.reset_index(drop=False)
|
|
|
|
|
|
self._log.info("Done reshaping the dataframe to a list of documents")
|
|
|
-
|
|
|
+
|
|
|
return reshaped_fields
|
|
|
|
|
|
else:
|
|
@@ -267,18 +271,28 @@ class DataFrameToCollection():
|
|
|
entries are arbitrary objects
|
|
|
(pandas unique() method does not work if entries are of complex types)
|
|
|
'''
|
|
|
- uniques = pd.DataFrame({"temp": x.tolist()})\
|
|
|
- .assign(temp_str=lambda y: y["temp"].astype(str))\
|
|
|
+
|
|
|
+
|
|
|
+ if x.size == 1:
|
|
|
+ return_value = x.tolist()
|
|
|
+ if return_value == [{}]:
|
|
|
+ return []
|
|
|
+ return return_value
|
|
|
+
|
|
|
+
|
|
|
+ uniques = pd.DataFrame({"temp": x.values})\
|
|
|
+ .assign(temp_str=lambda y: y["temp"].astype(str))\
|
|
|
.drop_duplicates(subset=["temp_str"])\
|
|
|
.drop("temp_str", axis=1).iloc[:, 0].tolist()
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
def is_empty(y):
|
|
|
is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
|
|
|
is_empty_list = (isinstance(y, list) and (len(y) == 0))
|
|
|
return is_empty_dict or is_empty_list
|
|
|
|
|
|
return [el for el in uniques if not is_empty(el)]
|
|
|
-
|
|
|
+
|
|
|
def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
|
|
|
'''
|
|
|
return: list of unique values from a Series where
|