|
@@ -19,6 +19,8 @@ from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema
|
|
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
|
|
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
|
|
from cdplib.utils.CleaningUtils import CleaningUtils
|
|
from cdplib.utils.CleaningUtils import CleaningUtils
|
|
from cdplib.log import Log
|
|
from cdplib.log import Log
|
|
|
|
+import json
|
|
|
|
+from boltons.iterutils import remap
|
|
|
|
|
|
class MigrationCleaning:
|
|
class MigrationCleaning:
|
|
'''
|
|
'''
|
|
@@ -376,6 +378,10 @@ class MigrationCleaning:
|
|
data[column] = data[column].astype(python_type)
|
|
data[column] = data[column].astype(python_type)
|
|
python_type = object
|
|
python_type = object
|
|
data[column] = data[column].astype(python_type)
|
|
data[column] = data[column].astype(python_type)
|
|
|
|
+
|
|
|
|
+ elif python_type == float:
|
|
|
|
+ data = data.fillna(np.inf)
|
|
|
|
+ data[column] = data[column].astype(python_type)
|
|
|
|
|
|
else:
|
|
else:
|
|
|
|
|
|
@@ -523,6 +529,28 @@ class MigrationCleaning:
|
|
continue
|
|
continue
|
|
|
|
|
|
return data
|
|
return data
|
|
|
|
+
|
|
|
|
def drop_columns_with_no_content(self, data: pd.DataFrame) -> pd.DataFrame:
    '''
    Drop every column that carries no usable information.

    A cell counts as "no content" when it is missing (None / NaN / NaT)
    or equals one of the placeholder strings '-' or 'n.a'. A column is
    removed only if all of its cells are "no content".

    :param data: frame to clean; it is not modified in place
    :return: a frame holding only the columns that have at least one
        cell with real content
    '''
    placeholder_signs = ('-', 'n.a')

    # isna() treats None and NaN alike, so a column mixing NaN with a
    # placeholder is handled the same way as one mixing None with it.
    no_content = data.isna() | data.isin(placeholder_signs)

    # Select by position (plain boolean array) rather than by label so
    # that duplicated column names cannot confuse the selection.
    keep_column = ~no_content.all(axis=0).values

    return data.loc[:, keep_column]
|
|
|
|
+
|
|
|
|
def clean_json_from_None_object(self, data: pd.DataFrame) -> dict:
    '''
    Convert a frame to a nested JSON-style dict and strip empty entries.

    The frame is serialised with ``to_json(date_format="iso")`` and
    parsed back, then every entry is removed whose value is

    - ``None``,
    - the string ``'None'`` or ``'inf'``,
    - falsy and not a bool (``0``, ``0.0``, ``''``), or
    - a container left empty once its children were removed.

    Booleans are always kept, including ``False``.

    :param data: frame to serialise and clean
    :return: the cleaned structure as returned by ``remap`` (a dict,
        not a DataFrame)
    '''
    parsed = json.loads(data.to_json(date_format="iso"))

    def has_content(path, key, value):
        # Booleans are real data even when False.
        if isinstance(value, bool):
            return True
        if value is None or value == 'None' or value == 'inf':
            return False
        # NOTE(review): this also drops numeric zeros and empty strings,
        # exactly as the original chain of filters did - confirm that
        # zero is never a meaningful value in the migrated data.
        return bool(value)

    # remap visits containers after their children, so a container that
    # lost all of its entries is already empty when tested here. One
    # traversal therefore gives the same result as filtering in
    # several consecutive passes.
    return remap(parsed, visit=has_content)
|
|
|
|
|
|
def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
|
|
def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
|
|
'''
|
|
'''
|