|
@@ -19,6 +19,8 @@ from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema
|
|
|
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
|
|
|
from cdplib.utils.CleaningUtils import CleaningUtils
|
|
|
from cdplib.log import Log
|
|
|
+import json
|
|
|
+from boltons.iterutils import remap
|
|
|
|
|
|
class MigrationCleaning:
|
|
|
'''
|
|
@@ -26,7 +28,7 @@ class MigrationCleaning:
|
|
|
We keep the correcting and the filtering methods separated,
|
|
|
since there might be other custom steps in between.
|
|
|
'''
|
|
|
- def __init__(self, mapping_path: str,
|
|
|
+ def __init__(self, mapping_paths: (str, list),
|
|
|
schema_paths: (str, list),
|
|
|
inconsist_report_table: str = None,
|
|
|
filter_index_columns: (str, list) = None,
|
|
@@ -51,11 +53,11 @@ class MigrationCleaning:
|
|
|
|
|
|
self._schema_parser = schema_parser(schema_paths)
|
|
|
|
|
|
- self._mapping_parser = mapping_parser(mapping_path,
|
|
|
+ self._mapping_parser = mapping_parser(mapping_paths,
|
|
|
source=mapping_source,
|
|
|
target=mapping_target)
|
|
|
|
|
|
- self._mapping_path = mapping_path
|
|
|
+ self._mapping_paths = mapping_paths
|
|
|
self._schema_paths = schema_paths
|
|
|
|
|
|
from cdplib.db_handlers.SQLHandler import SQLHandler
|
|
@@ -92,6 +94,11 @@ class MigrationCleaning:
|
|
|
@property
|
|
|
def _default_values(self):
|
|
|
'''
|
|
|
+ Returns a dictionary in which the default values of the mongo schema
|
|
|
+ are mapped to the default values of the migration mapping. In migration
|
|
|
+ mapping the default values should be specified as the values which
|
|
|
+ don't contain any information and can therefore be seen as an empty
|
|
|
+ value.
|
|
|
'''
|
|
|
default_values = {}
|
|
|
|
|
@@ -104,14 +111,13 @@ class MigrationCleaning:
|
|
|
continue
|
|
|
|
|
|
elif target_field not in target_default_values:
|
|
|
-
|
|
|
target_default_values[target_field] = np.nan
|
|
|
|
|
|
default_values[source_field] = {
|
|
|
target_default_values[target_field]:
|
|
|
source_default_values[source_field]
|
|
|
}
|
|
|
-
|
|
|
+
|
|
|
return default_values
|
|
|
|
|
|
@property
|
|
@@ -119,7 +125,6 @@ class MigrationCleaning:
|
|
|
'''
|
|
|
'''
|
|
|
target_types = self._schema_parser.get_python_types()
|
|
|
-
|
|
|
result = {}
|
|
|
|
|
|
for source_field, target_field in self._field_mapping.items():
|
|
@@ -229,7 +234,7 @@ class MigrationCleaning:
|
|
|
data_inconsist = data.assign(reason=reason)\
|
|
|
.loc[invalid_mask]\
|
|
|
.reset_index(drop=True)
|
|
|
-
|
|
|
+
|
|
|
if db.check_if_table_exists(self._inconsist_report_table):
|
|
|
columns = db.get_column_names(tablename=self._inconsist_report_table)
|
|
|
|
|
@@ -264,8 +269,6 @@ class MigrationCleaning:
|
|
|
|
|
|
data = data.loc[~all_index.isin(nok_index)].reset_index(drop=True)
|
|
|
|
|
|
- #self._sql_db.release(db)
|
|
|
-
|
|
|
return data
|
|
|
|
|
|
def _replace_values(self, data: pd.DataFrame,
|
|
@@ -298,10 +301,11 @@ class MigrationCleaning:
|
|
|
for key, values in d.items():
|
|
|
|
|
|
if not default:
|
|
|
-
|
|
|
+
|
|
|
mask = (data[column].astype(str).isin(values))
|
|
|
|
|
|
else:
|
|
|
+
|
|
|
mask = (data[column].isin(values))
|
|
|
|
|
|
if default:
|
|
@@ -309,7 +313,7 @@ class MigrationCleaning:
|
|
|
mask = mask | (data[column].isnull())
|
|
|
|
|
|
data.loc[mask, column] = key
|
|
|
-
|
|
|
+
|
|
|
data[column] = data[column].astype(dtype)
|
|
|
|
|
|
except Exception as e:
|
|
@@ -357,12 +361,11 @@ class MigrationCleaning:
|
|
|
|
|
|
elif python_type == bool:
|
|
|
|
|
|
- data[column] = data[column].str.lower()
|
|
|
- accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
|
|
|
+ accepted_bool = {'ja': True, 'j': True, '1': True, 1: True,
|
|
|
'yes': True, 'y': True, 'true':True,
|
|
|
't': True, 'nein': False, 'n': False,
|
|
|
'no': False, 'false': False, 'f': False,
|
|
|
- '0': False, 0:False}
|
|
|
+ '0': False, 0: False}
|
|
|
data[column] = data[column].map(accepted_bool)
|
|
|
data[column] = data[column].astype(bool)
|
|
|
|
|
@@ -375,6 +378,10 @@ class MigrationCleaning:
|
|
|
data[column] = data[column].astype(python_type)
|
|
|
python_type = object
|
|
|
data[column] = data[column].astype(python_type)
|
|
|
+
|
|
|
+ elif python_type == float:
|
|
|
+ data = data.fillna(np.inf)
|
|
|
+ data[column] = data[column].astype(python_type)
|
|
|
|
|
|
else:
|
|
|
|
|
@@ -382,10 +389,6 @@ class MigrationCleaning:
|
|
|
data[column] = data[column].astype(python_type)
|
|
|
|
|
|
if data[column].dtype != python_type:
|
|
|
- print('---------------------------------------------')
|
|
|
- print(data[column].to_csv(column))
|
|
|
- print(python_type)
|
|
|
- print(column)
|
|
|
|
|
|
self.log.warning(("After conversion type in {0} "
|
|
|
"should be {1} "
|
|
@@ -436,7 +439,11 @@ class MigrationCleaning:
|
|
|
continue
|
|
|
|
|
|
python_type = self._python_types[column]
|
|
|
-
|
|
|
+
|
|
|
+ # Needs to be done since the column dtype of strings is object
|
|
|
+ if python_type == str:
|
|
|
+ python_type = object
|
|
|
+
|
|
|
if data[column].dtype != python_type:
|
|
|
|
|
|
def mismatch_type(x):
|
|
@@ -444,7 +451,7 @@ class MigrationCleaning:
|
|
|
|
|
|
invalid_mask = data[column].apply(mismatch_type)
|
|
|
|
|
|
- reason = "Type mismatch if field {}".format(column)
|
|
|
+ reason = "Type mismatch in field {}".format(column)
|
|
|
|
|
|
data = self._filter_invalid_data(data=data,
|
|
|
invalid_mask=invalid_mask,
|
|
@@ -466,12 +473,12 @@ class MigrationCleaning:
|
|
|
|
|
|
invalid_mask = (~data[column].astype(str).str.match(pattern))
|
|
|
|
|
|
- reason = "Pattern mismatch in field {}".format(column)
|
|
|
+ reason = "Pattern mismatch in field {0}. Pattern: {1}Example: {2}"\
|
|
|
+ .format(column,pattern,data.iloc[0][column])
|
|
|
|
|
|
data = self._filter_invalid_data(data=data,
|
|
|
invalid_mask=invalid_mask,
|
|
|
reason=reason)
|
|
|
-
|
|
|
return data
|
|
|
|
|
|
def filter_invalid_values(self, data: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -504,7 +511,7 @@ class MigrationCleaning:
|
|
|
reason=reason)
|
|
|
|
|
|
elif column in self._allowed_values:
|
|
|
-
|
|
|
+
|
|
|
allowed_values = self._allowed_values[column]
|
|
|
|
|
|
invalid_mask = (~data[column].isin(allowed_values))
|
|
@@ -522,15 +529,37 @@ class MigrationCleaning:
|
|
|
continue
|
|
|
|
|
|
return data
|
|
|
+
|
|
|
+ def drop_columns_with_no_content(self, data: pd.DataFrame) -> pd.DataFrame():
|
|
|
+ '''
|
|
|
+ '''
|
|
|
+ data = data.dropna(how ='all', axis='columns')
|
|
|
+ for column in data.columns:
|
|
|
+ unique_values = data[column].unique()
|
|
|
+ no_content_signs = [None, '-', 'n.a']
|
|
|
+ intersection = list(set(unique_values) & set(no_content_signs))
|
|
|
+ if len(intersection) - len(unique_values) == 0:
|
|
|
+ data = data.drop(columns=[column])
|
|
|
+
|
|
|
+ return data
|
|
|
+
|
|
|
+ def clean_json_from_None_object(self, data: pd.DataFrame) -> pd.DataFrame():
|
|
|
+ data = data.to_json(date_format="iso")
|
|
|
+ data = json.loads(data)
|
|
|
+ new_data = remap(data, lambda p, k, v: v is not None)
|
|
|
+ new_data = remap(new_data, lambda p, k, v: v != 'None')
|
|
|
+ new_data = remap(new_data, lambda p, k, v: v != 'inf')
|
|
|
+ new_data = remap(new_data, lambda p, k, v: (isinstance(v,bool) or (not isinstance(v,bool) and bool(v))))
|
|
|
+ return new_data
|
|
|
|
|
|
def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
|
|
|
'''
|
|
|
'''
|
|
|
mongo_fields = self._schema_parser.get_fields_restricted_to_collection(collection_name=collection_name)
|
|
|
-
|
|
|
- fields = self._mapping_parser.get_fields_restricted_to_collecton(collection_name=collection_name)
|
|
|
-
|
|
|
- return data[[c for c in data.columns if (c in fields) or (c in mongo_fields)]]
|
|
|
+
|
|
|
+ mapping_fields = self._mapping_parser.get_fields_restricted_to_collection(collection_name=collection_name)
|
|
|
+
|
|
|
+ return data[[c for c in data.columns if (c in mapping_fields) or (c in mongo_fields)]]
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
@@ -550,7 +579,7 @@ if __name__ == "__main__":
|
|
|
if all([os.path.isfile(p) for p in schema_paths + [mapping_path]]):
|
|
|
|
|
|
cleaner = MigrationCleaning(
|
|
|
- mapping_path=mapping_path,
|
|
|
+ mapping_paths=mapping_path,
|
|
|
schema_paths=schema_paths,
|
|
|
mapping_source="internal_name",
|
|
|
mapping_target="mongo_name",
|