|
@@ -5,7 +5,6 @@ Created on Wed Sep 25 08:09:52 2019
|
|
|
|
|
|
@author: tanya
|
|
@author: tanya
|
|
"""
|
|
"""
|
|
-
|
|
|
|
import os
|
|
import os
|
|
import sys
|
|
import sys
|
|
import pandas as pd
|
|
import pandas as pd
|
|
@@ -14,8 +13,8 @@ import gc
|
|
|
|
|
|
sys.path.append(os.getcwd())
|
|
sys.path.append(os.getcwd())
|
|
|
|
|
|
-from cdplib.db_migration.ParseMapping import ParseMapping
|
|
|
|
-from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema
|
|
|
|
|
|
+from libraries.db_migration.ParseMapping import ParseMapping
|
|
|
|
+from libraries.db_migration.ParseJsonSchema import ParseJsonSchema
|
|
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
|
|
from cdplib.utils.ExceptionsHandler import ExceptionsHandler
|
|
from cdplib.utils.CleaningUtils import CleaningUtils
|
|
from cdplib.utils.CleaningUtils import CleaningUtils
|
|
from cdplib.log import Log
|
|
from cdplib.log import Log
|
|
@@ -30,8 +29,9 @@ class MigrationCleaning:
|
|
schema_paths: (str, list),
|
|
schema_paths: (str, list),
|
|
inconsist_report_table: str = None,
|
|
inconsist_report_table: str = None,
|
|
filter_index_columns: (str, list) = None,
|
|
filter_index_columns: (str, list) = None,
|
|
- mapping_source: str = "internal_name",
|
|
|
|
- mapping_target: str = "mongo_name",
|
|
|
|
|
|
+ mapping_source_name_tag: str = "internal_name",
|
|
|
|
+ mapping_target_name_tag: str = "mongo_name",
|
|
|
|
+ target_collection_name: str = None,
|
|
mapping_parser: type = ParseMapping,
|
|
mapping_parser: type = ParseMapping,
|
|
schema_parser: type = ParseJsonSchema):
|
|
schema_parser: type = ParseJsonSchema):
|
|
'''
|
|
'''
|
|
@@ -39,21 +39,41 @@ class MigrationCleaning:
|
|
self._log = Log('Migration Cleaning')
|
|
self._log = Log('Migration Cleaning')
|
|
self._exception_handler = ExceptionsHandler()
|
|
self._exception_handler = ExceptionsHandler()
|
|
|
|
|
|
- assert isinstance(inconsist_report_table, str),\
|
|
|
|
- "Inconsistent report table should be a tablename string"
|
|
|
|
|
|
+ if inconsist_report_table is not None:
|
|
|
|
+ assert isinstance(inconsist_report_table, str),\
|
|
|
|
+ "Inconsistent report table should be a tablename string"
|
|
|
|
|
|
self._inconsist_report_table = inconsist_report_table
|
|
self._inconsist_report_table = inconsist_report_table
|
|
|
|
|
|
- assert isinstance(filter_index_columns, (str, list)),\
|
|
|
|
- "Filter index columns must be a str or a list"
|
|
|
|
|
|
+ if filter_index_columns is not None:
|
|
|
|
+ assert isinstance(filter_index_columns, (str, list)),\
|
|
|
|
+ "Filter index columns must be a str or a list"
|
|
|
|
+
|
|
|
|
+ self._filter_index_columns = list(filter_index_columns)
|
|
|
|
|
|
- self._filter_index_columns = list(filter_index_columns)
|
|
|
|
|
|
+ else:
|
|
|
|
|
|
- self._schema_parser = schema_parser(schema_paths)
|
|
|
|
|
|
+ self._filter_index_columns = None
|
|
|
|
|
|
self._mapping_parser = mapping_parser(mapping_path,
|
|
self._mapping_parser = mapping_parser(mapping_path,
|
|
- source=mapping_source,
|
|
|
|
- target=mapping_target)
|
|
|
|
|
|
+ source_name_tag=mapping_source_name_tag,
|
|
|
|
+ target_name_tag=mapping_target_name_tag,
|
|
|
|
+ target_collection_name=target_collection_name)
|
|
|
|
+
|
|
|
|
+ if target_collection_name is not None:
|
|
|
|
+
|
|
|
|
+ schema_names = [os.path.basename(schema_path) for schema_path in schema_paths]
|
|
|
|
+
|
|
|
|
+ convention_schema_name = "schema_" + target_collection_name + ".json"
|
|
|
|
+
|
|
|
|
+ if convention_schema_name not in schema_names:
|
|
|
|
+ self._log.log_and_raise_warning("Found no matching of the collection name {0} in schema paths {1}"
|
|
|
|
+ .format(target_collection_name, schema_paths))
|
|
|
|
+ else:
|
|
|
|
+ self._schema_parser = schema_parser(schema_paths[schema_names.index(convention_schema_name)])
|
|
|
|
+
|
|
|
|
+ else:
|
|
|
|
+ self._schema_parser = schema_parser(schema_paths)
|
|
|
|
|
|
self._mapping_path = mapping_path
|
|
self._mapping_path = mapping_path
|
|
self._schema_paths = schema_paths
|
|
self._schema_paths = schema_paths
|
|
@@ -68,7 +88,7 @@ class MigrationCleaning:
|
|
"Parameter 'data' must be a pandas dataframe"
|
|
"Parameter 'data' must be a pandas dataframe"
|
|
|
|
|
|
@property
|
|
@property
|
|
- def _field_mapping(self):
|
|
|
|
|
|
+ def _field_mapping(self, collection_name: str = None):
|
|
'''
|
|
'''
|
|
'''
|
|
'''
|
|
return self._mapping_parser.get_field_mapping()
|
|
return self._mapping_parser.get_field_mapping()
|
|
@@ -503,6 +523,7 @@ class MigrationCleaning:
|
|
|
|
|
|
return data
|
|
return data
|
|
|
|
|
|
|
|
+ """
|
|
def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
|
|
def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
|
|
'''
|
|
'''
|
|
'''
|
|
'''
|
|
@@ -511,6 +532,7 @@ class MigrationCleaning:
|
|
fields = self._mapping_parser.get_fields_restricted_to_collecton(collection_name=collection_name)
|
|
fields = self._mapping_parser.get_fields_restricted_to_collecton(collection_name=collection_name)
|
|
|
|
|
|
return data[[c for c in data.columns if (c in fields) or (c in mongo_fields)]]
|
|
return data[[c for c in data.columns if (c in fields) or (c in mongo_fields)]]
|
|
|
|
+ """
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
@@ -532,8 +554,8 @@ if __name__ == "__main__":
|
|
cleaner = MigrationCleaning(
|
|
cleaner = MigrationCleaning(
|
|
mapping_path=mapping_path,
|
|
mapping_path=mapping_path,
|
|
schema_paths=schema_paths,
|
|
schema_paths=schema_paths,
|
|
- mapping_source="internal_name",
|
|
|
|
- mapping_target="mongo_name",
|
|
|
|
|
|
+ mapping_source_name_tag="internal_name",
|
|
|
|
+ mapping_target_name_tag="mongo_name",
|
|
filter_index_columns=["radsatznummer"],
|
|
filter_index_columns=["radsatznummer"],
|
|
inconsist_report_table=inconsist_report_table)
|
|
inconsist_report_table=inconsist_report_table)
|
|
|
|
|
|
@@ -562,4 +584,3 @@ if __name__ == "__main__":
|
|
data = cleaner.filter_notallowed_values(data)
|
|
data = cleaner.filter_notallowed_values(data)
|
|
|
|
|
|
print("Done!")
|
|
print("Done!")
|
|
-
|
|
|