@@ -71,16 +71,16 @@ class DataFrameToCollection():
         '''
 
         from copy import deepcopy
-
+
         data = self._melt_duplicated_columns(data)
-
+
         reshaped_fields = []
 
         if schema is None:
             schema = self.schema
 
         for field in schema["properties"]:
-
+
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
@@ -90,10 +90,10 @@ class DataFrameToCollection():
             if field_type not in ["array", "object"]:
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
-
+
                 # check that there is only one possible value of this field
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
-
+
                 # n_distinct_valus can be 0 if the column only contains NaN values
                 if n_distinct_values > 1:
                     err = "Field {0} is not unique with respect to {1}"\
@@ -115,30 +115,30 @@ class DataFrameToCollection():
             elif field_type == "object":
 
                 sub_schema = deepcopy(schema["properties"][field])
-
+
                 # rename sub-schema properties to match with data column names
                 sub_schema["properties"] =\
                     {".".join([field, k]): v for k, v
                      in sub_schema["properties"].items()}
-
+
                 sub_data = self.to_list_of_documents(
                         data=data,
                         schema=sub_schema,
                         grp_fields=grp_fields,
                         _final_step=False)
-
+
                 # Need to be checked since child elements can be empty
                 if sub_data is not None:
-
+
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field.name = field
-
+
                     reshaped_fields.append(reshaped_field)
 
             # if field is a list of dictionaries
             elif field_type == "array":
-
-
+
+
                 items_type = schema["properties"][field]["items"]["bsonType"]
 
                 if items_type == "object":
@@ -149,12 +149,12 @@ class DataFrameToCollection():
                     sub_schema["properties"] =\
                         {".".join([field, k]): v for k, v in
                          sub_schema["properties"].items()}
-
+
                     # extend grp fields by sub-fields of field simple types
                     sub_grp_fields = [f for f in sub_schema["properties"]
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       and (f in data.columns)]
-
+
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
@@ -162,19 +162,19 @@ class DataFrameToCollection():
 
                         self._log.error(err)
                         raise Exception(err)
-
+
                     # group and reshape sub-fields with complex types
                     sub_data = self.to_list_of_documents(
                             data=data,
                             schema=sub_schema,
                             grp_fields=grp_fields + sub_grp_fields,
                             _final_step=False)
-
+
                     if sub_data is not None:
-
+
                         # gether the results into a list of dictionaries
                         sub_data = sub_data.apply(self._make_dict, axis=1)
-
+
                         sub_data.name = field
                         sub_data = sub_data.reset_index(grp_fields)
                         ######################################################
@@ -184,8 +184,8 @@ class DataFrameToCollection():
                             .apply(self._make_list_of_distinct)
                         ######################################################
                         reshaped_fields.append(reshaped_field)
-
-
+
+
                 # if field is a list of values with simple type
                 elif items_type == "array":
                     grp_fields = [c for c in grp_fields if c in data.columns]
@@ -208,9 +208,9 @@ class DataFrameToCollection():
                     reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
-
+
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
-
+
             if _final_step:
                 # dropping the index names if it is the final step,
                 # if not the index is needed for merging
@@ -219,7 +219,7 @@ class DataFrameToCollection():
                     .reset_index(drop=False)
 
                 self._log.info("Done reshaping the dataframe to a list of documents")
-
+
             return reshaped_fields
 
         else:
@@ -272,8 +272,8 @@ class DataFrameToCollection():
         entries are arbitrary objects
         (pandas unique() method does not work if entries are of complex types)
         '''
-
-
+
+
         if x.size == 1:
             uniques = x.tolist()
         '''
@@ -287,15 +287,15 @@ class DataFrameToCollection():
                     .assign(temp_str=lambda y: y["temp"].astype(np.str))\
                     .drop_duplicates(subset=["temp_str"])\
                     .drop("temp_str", axis=1).iloc[:, 0].tolist()
-
-
+
+
         def is_empty(y):
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             return is_empty_dict or is_empty_list
 
         return [el for el in uniques if not is_empty(el)]
-
+
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
         '''
         return: list of unique values from a Series where
@@ -413,11 +413,3 @@ if __name__ == "__main__":
             data=df,
             schema=schm,
             grp_fields=grp_fields)
-
-
-
-
-
-
-
-