
Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib

ogert committed 4 years ago
commit
2348496e68
4 changed files with 201 additions and 85 deletions
  1. cdplib/db_migration/MigrationCleaning.py (+80 −35)
  2. cdplib/db_migration/ParseJsonSchema.py (+1 −2)
  3. cdplib/db_migration/ParseMapping.py (+116 −45)
  4. setup.py (+4 −3)

+ 80 - 35
cdplib/db_migration/MigrationCleaning.py

@@ -19,6 +19,8 @@ from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema
 from cdplib.utils.ExceptionsHandler import ExceptionsHandler
 from cdplib.utils.CleaningUtils import CleaningUtils
 from cdplib.log import Log
+import json
+from boltons.iterutils import remap
 
 class MigrationCleaning:
     '''
@@ -26,7 +28,7 @@ class MigrationCleaning:
     We keep the correcting and the filtering methods separated,
     since there might be other custom steps in between.
     '''
-    def __init__(self, mapping_path: str,
+    def __init__(self, mapping_paths: (str, list),
                  schema_paths: (str, list),
                  inconsist_report_table: str = None,
                  filter_index_columns: (str, list) = None,
@@ -51,11 +53,11 @@ class MigrationCleaning:
 
         self._schema_parser = schema_parser(schema_paths)
 
-        self._mapping_parser = mapping_parser(mapping_path,
+        self._mapping_parser = mapping_parser(mapping_paths,
                                               source=mapping_source,
                                               target=mapping_target)
 
-        self._mapping_path = mapping_path
+        self._mapping_paths = mapping_paths
         self._schema_paths = schema_paths
 
         from cdplib.db_handlers.SQLHandler import SQLHandler
@@ -67,6 +69,21 @@ class MigrationCleaning:
         assert(isinstance(data, pd.DataFrame)),\
             "Parameter 'data' must be a pandas dataframe"
 
+    def append_mapping_path(self, mapping_path: str):
+        '''
+        Appends a new mapping path to the _mapping_paths attribute of
+        MigrationCleaning and to the mapping paths of the ParseMapping
+        instance.
+        '''
+        assert(isinstance(mapping_path, str)),\
+            "Parameter 'mapping_path' must be a string"
+
+        if isinstance(self._mapping_paths, str):
+            self._mapping_paths = [self._mapping_paths]
+
+        self._mapping_paths.append(mapping_path)
+        self._mapping_parser._mapping_paths = self._mapping_paths
+        # Reload so that the parser picks up the newly appended mapping
+        self._mapping_parser._update_mapping()
+
+
     @property
     def _field_mapping(self):
         '''
@@ -92,6 +109,11 @@ class MigrationCleaning:
     @property
     def _default_values(self):
         '''
+        Returns a dictionary in which the default values of the mongo schema
+        are mapped to the default values of the migration mapping. In the
+        migration mapping the default values should be specified as values
+        that do not carry any information and can therefore be treated as
+        empty.
         '''
         default_values = {}
 
@@ -104,7 +126,6 @@ class MigrationCleaning:
                 continue
 
             elif target_field not in target_default_values:
-
                 target_default_values[target_field] = np.nan
 
             default_values[source_field] = {
@@ -119,7 +140,6 @@ class MigrationCleaning:
         '''
         '''
         target_types = self._schema_parser.get_python_types()
-
         result = {}
 
         for source_field, target_field in self._field_mapping.items():
@@ -169,21 +189,6 @@ class MigrationCleaning:
         '''
         return self._mapping_parser.get_date_formats()
 
-    def _get_mongo_schema_info(self, method_name: str):
-        '''
-        '''
-        result = {}
-
-        target_dict = getattr(self._schema_parser, method_name)()
-
-        for source_field, target_field in self._field_mapping.items():
-
-            if target_field in target_dict:
-
-                result[source_field] = target_dict[target_field]
-
-        return result
-
     @property
     def _allowed_values(self):
         '''
@@ -208,6 +213,21 @@ class MigrationCleaning:
         '''
         return self._get_mongo_schema_info("get_patterns")
 
+    def _get_mongo_schema_info(self, method_name: str):
+        '''
+        '''
+        result = {}
+
+        target_dict = getattr(self._schema_parser, method_name)()
+
+        for source_field, target_field in self._field_mapping.items():
+
+            if target_field in target_dict:
+
+                result[source_field] = target_dict[target_field]
+
+        return result
+
     def _filter_invalid_data(self, data: pd.DataFrame,
                              invalid_mask: pd.Series,
                              reason: (str, pd.Series)) -> pd.DataFrame:
@@ -264,8 +284,6 @@ class MigrationCleaning:
 
         data = data.loc[~all_index.isin(nok_index)].reset_index(drop=True)
 
-        #self._sql_db.release(db)
-
         return data
 
     def _replace_values(self, data: pd.DataFrame,
@@ -302,6 +320,7 @@ class MigrationCleaning:
                         mask = (data[column].astype(str).isin(values))
 
                     else:
+
                         mask = (data[column].isin(values))
 
                     if default:
@@ -357,12 +376,11 @@ class MigrationCleaning:
 
                 elif python_type == bool:
 
-                    data[column] = data[column].str.lower()
-                    accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
+                    accepted_bool = {'ja': True, 'j': True, '1': True, 1: True,
                                      'yes': True, 'y': True, 'true':True,
                                      't': True, 'nein': False, 'n': False,
                                      'no': False, 'false': False, 'f': False,
-                                     '0': False, 0:False}
+                                     '0': False, 0: False}
                     data[column] = data[column].map(accepted_bool)
                     data[column] = data[column].astype(bool)
 
@@ -376,16 +394,17 @@ class MigrationCleaning:
                     python_type = object
                     data[column] = data[column].astype(python_type)
 
+                elif python_type == float:
+                    
+                    data[column] = data[column].fillna(np.inf)
+                    data[column] = data[column].astype(python_type)
+
                 else:
 
                     data = data.copy(deep=True)
                     data[column] = data[column].astype(python_type)
 
                 if data[column].dtype != python_type:
-                    print('---------------------------------------------')
-                    print(data[column].to_csv(column))
-                    print(python_type)
-                    print(column)
 
                     self.log.warning(("After conversion type in {0} "
                                        "should be {1} "
@@ -437,6 +456,10 @@ class MigrationCleaning:
 
             python_type = self._python_types[column]
 
+            # Needed because the column dtype of strings is object
+            if python_type == str:
+                python_type = object
+
             if data[column].dtype != python_type:
 
                 def mismatch_type(x):
@@ -444,7 +467,7 @@ class MigrationCleaning:
 
                 invalid_mask = data[column].apply(mismatch_type)
 
-                reason = "Type mismatch if field {}".format(column)
+                reason = "Type mismatch in field {}".format(column)
 
                 data = self._filter_invalid_data(data=data,
                                                  invalid_mask=invalid_mask,
@@ -466,12 +489,12 @@ class MigrationCleaning:
 
             invalid_mask = (~data[column].astype(str).str.match(pattern))
 
-            reason = "Pattern mismatch in field {}".format(column)
+            reason = "Pattern mismatch in field {0}. Pattern: {1}Example: {2}"\
+                    .format(column,pattern,data.iloc[0][column])
 
             data = self._filter_invalid_data(data=data,
                                              invalid_mask=invalid_mask,
                                              reason=reason)
-
         return data
 
     def filter_invalid_values(self, data: pd.DataFrame) -> pd.DataFrame:
@@ -523,14 +546,36 @@ class MigrationCleaning:
 
         return data
 
+    def drop_columns_with_no_content(self, data: pd.DataFrame) -> pd.DataFrame:
+        '''
+        Drops columns that are completely empty or contain only
+        no-content placeholders such as '-' or 'n.a'.
+        '''
+        data = data.dropna(how='all', axis='columns')
+        no_content_signs = [None, '-', 'n.a']
+        for column in data.columns:
+            unique_values = data[column].unique()
+            # Drop the column if every unique value is a no-content sign
+            if set(unique_values) <= set(no_content_signs):
+                data = data.drop(columns=[column])
+
+        return data
+
+
+    def clean_json_from_None_object(self, data: pd.DataFrame) -> dict:
+        '''
+        Serializes the dataframe to json and recursively removes entries
+        that are None, the strings 'None' or 'inf', or otherwise falsy
+        (booleans are kept).
+        '''
+        data = data.to_json(date_format="iso")
+        data = json.loads(data)
+        new_data = remap(data, lambda p, k, v: v is not None)
+        new_data = remap(new_data, lambda p, k, v: v != 'None')
+        new_data = remap(new_data, lambda p, k, v: v != 'inf')
+        new_data = remap(new_data, lambda p, k, v: isinstance(v, bool) or bool(v))
+        return new_data
+
+
     def restrict_to_collection(self, data: pd.DataFrame, collection_name: str) -> pd.DataFrame:
         '''
         '''
         mongo_fields = self._schema_parser.get_fields_restricted_to_collection(collection_name=collection_name)
 
-        fields = self._mapping_parser.get_fields_restricted_to_collecton(collection_name=collection_name)
+        mapping_fields = self._mapping_parser.get_fields_restricted_to_collection(collection_name=collection_name)
 
-        return data[[c for c in data.columns if (c in fields) or (c in mongo_fields)]]
+        return data[[c for c in data.columns if (c in mapping_fields) or (c in mongo_fields)]]
 
 
 if __name__ == "__main__":
@@ -550,7 +595,7 @@ if __name__ == "__main__":
     if all([os.path.isfile(p) for p in schema_paths + [mapping_path]]):
 
         cleaner = MigrationCleaning(
-                mapping_path=mapping_path,
+                mapping_paths=mapping_path,
                 schema_paths=schema_paths,
                 mapping_source="internal_name",
                 mapping_target="mongo_name",

+ 1 - 2
cdplib/db_migration/ParseJsonSchema.py

@@ -113,7 +113,6 @@ class ParseJsonSchema(ParseDbSchema):
         '''
         mongo_types = self.get_mongo_types()
         python_types = {}
-
         bson_to_python_types = {"double": float,
                                 "decimal": float,
                                 "string": str,
@@ -199,7 +198,7 @@ class ParseJsonSchema(ParseDbSchema):
         result = self._parse_one(schema=schemas[0],
                                  field_info=field_info,
                                  required_only=required_only)
-
+        
         for schema in schemas[1:]:
             
             next_result = self._parse_one(schema=schema,

+ 116 - 45
cdplib/db_migration/ParseMapping.py

@@ -9,39 +9,72 @@ Created on Fri Sep 20 15:33:17 2019
 import os
 import sys
 import numpy as np
+import json
+
+from cdplib.log import Log
 sys.path.append(os.getcwd())
 
 class ParseMapping:
     '''
     '''
-    def __init__(self, mapping_path: str, log_name: str = "ParseMapping",
+    def __init__(self, mapping_paths: (str, list), log_name: str = "ParseMapping",
                  source: str = "original_name", target: str = "mongo_name",
-                 target_collection: str = "mongo_collection"):
+                 target_collections: str = "mongo_collection"):
         '''
         '''
-        import json
-        from cdplib.log import Log
 
         self._log = Log('Parse Mapping')
 
-        if not os.path.isfile(mapping_path):
-            err = "Mapping not found "+mapping_path
-            self._log.error(err)
-            raise FileNotFoundError(err)
-
-        try:
-            with open(mapping_path, "r") as f:
-                self._mapping = json.load(f)
+        assert(isinstance(mapping_paths, (list, str))),\
+            "Parameter 'mapping_paths' must be a string or a list"
 
-        except Exception as e:
-            err = ("Could not load mapping. " + mapping_path +
-                   "Exit with error {}".format(e))
-            self._log.error(err)
-            raise Exception(err)
+        if isinstance(mapping_paths, str):
+            mapping_paths = [mapping_paths]
 
+        self._mapping_paths = mapping_paths
         self._source = source
         self._target = target
-        self._target_collection = target_collection
+        self._target_collections = target_collections
+        self._update_mapping()
+
+    def _update_mapping(self):
+        '''
+        Since there can be multiple mappings per table, all of them have to
+        be added to the object. The mappings are concatenated so that the
+        other methods of the class do not have to be adjusted to accept list
+        input. The class could be extended to accept a list, or even a
+        dictionary with the mapping name as key and the json mapping as
+        value.
+        !!! WARNING !!!
+        Since the mappings are simply concatenated, there is currently no
+        way to tell from the object itself which item belongs to which
+        mapping file.
+        '''
+        mappings = []
+
+        for mapping_path in self._mapping_paths:
+            try:
+                with open(mapping_path, "r") as f:
+                    mapping = json.load(f)
+                mappings.append(mapping)
+
+            except Exception as e:
+                err = ("Could not load mapping {0}. "
+                       "Obtained error: {1}".format(mapping_path, e))
+
+                self._log.error(err)
+                raise Exception(err)
+
+        # Concatenate all mappings into one flat list
+        self._mapping = [entry for mapping in mappings for entry in mapping]
+
 
     def get_field_mapping(self) -> dict:
         '''
@@ -58,7 +91,7 @@ class ParseMapping:
             "Invalid from field"
 
         return [d[self._source] for d in self._mapping
-                if (key in d) and (d[key] == value)]
+                if (key in d) and (value in d[key])]
 
     def get_required_fields(self) -> list:
         '''
@@ -72,39 +105,78 @@ class ParseMapping:
         return self._get_fields_satistisfying_condition(key="type",
                                                         value="Date")
 
-    def get_fields_restricted_to_collecton(self, collection_name: str) -> list:
+    def get_fields_restricted_to_collection(self, collection_name: str) -> list:
         '''
         '''
-        return self._get_fields_satistisfying_condition(key=self._target_collection,
+        return self._get_fields_satistisfying_condition(key=self._target_collections,
                                                         value=collection_name)
 
-    def _get_info(self, key: str, value=None) -> dict:
+    def _get_property_from_mapping(self, property_names: list) -> dict:
         '''
+        Get specified property names from migration mapping json.
         '''
+        assert(isinstance(property_names, list)),\
+            "Parameter 'property_names' is not a list"
+
         assert(all([self._source in d for d in self._mapping])),\
-            "Invalid from field"
+            "Not all objects in the mapping json contain property tag " + self._source
+
+        result = {}
+        for column_mapping in self._mapping:
+            for property_name in property_names:
+                if property_name in column_mapping and column_mapping[property_name]:
+                    result.update({column_mapping[self._source]:
+                                   column_mapping[property_name]})
+
+        return result
+
 
-        return {d[self._source]: d[key] for d in self._mapping
-                if (key in d) and ((value is not None)
-                and (d[key] == value)) or (key in d)}
 
     def get_default_values(self) -> dict:
         '''
+        Gets the default values from the migration mapping json. If more
+        property names are added, also add them to the unit test.
+        '''
+        standard_default_names=["default_values"]
+
+        return self._get_property_from_mapping(standard_default_names)
+
+    def get_types(self) -> dict:
+        '''
+        Gets the types from the migration mapping json. If more property
+        names are added, also add them to the unit test.
+        '''
+
+        standard_type_names=["type"]
+
+        return self._get_property_from_mapping(standard_type_names)
+
+    def get_value_mappings(self) -> dict:
+        '''
+        Gets the value mappings from the migration mapping json. If more
+        property names are added, also add them to the unit test.
         '''
-        return self._get_info(key="default_values")
+
+        standard_value_mapping_names = ["value_mapping"]
+
+        return self._get_property_from_mapping(standard_value_mapping_names)
 
     def get_date_formats(self) -> dict:
         '''
+        Gets the date formats from the migration mapping json. If more
+        property names or values are added, also add them to the unit test.
         '''
-        return self._get_info(key="date_format")
+        
+        standard_date_format_names = ["date_format"]
+
+        return self._get_property_from_mapping(standard_date_format_names)
     
     def get_internal_names(self) -> dict:
         '''
         '''
- 
+
         if all(["internal_name" in d for d in self._mapping]):
             internal_names = [d["internal_name"] for d in self._mapping]
-    
+
         elif all(["internal_name" not in d for d in self._mapping]):
             internal_names = list(range(len(self._mapping)))
 
@@ -134,10 +206,7 @@ class ParseMapping:
 
         return mongo_names
 
-    def get_types(self) -> dict:
-        '''
-        '''
-        return self._get_info(key="type")
+
 
     def get_python_types(self) -> dict:
         '''
@@ -153,10 +222,7 @@ class ParseMapping:
 
         return {k: sql_to_python_dtypes[v] for k, v in sql_types.items()}
 
-    def get_value_mappings(self) -> dict:
-        '''
-        '''
-        return self._get_info(key="value_mapping")
+
 
     def get_column_numbers(self) -> list:
         '''
@@ -178,7 +244,7 @@ class ParseMapping:
 
 if __name__ == "__main__":
 
-    mapping_path = os.path.join(".", "migration_mappings", "rs0_mapping.json")
+    mapping_path = os.path.join(".", "migration_mappings", "unit_test_migration_mapping.json")
 
     if os.path.isfile(mapping_path):
 
@@ -187,14 +253,19 @@ if __name__ == "__main__":
         parser = ParseMapping(mapping_path, source="internal_name",
                               target="mongo_name")
 
-        internal_to_mongo_mapping = parser.get_field_mapping()
-
-        original_to_internal_mapping = parser.get_field_mapping()
-
         default_values = parser.get_default_values()
-
+        print(default_values)
+        date_formats = parser.get_date_formats()
+        print(date_formats)
+        mongo_names = parser.get_mongo_names()
+        print(mongo_names)
         types = parser.get_types()
-
+        print(types)
         column_numbers = parser.get_column_numbers()
+        print(column_numbers)
+        value_mappings = parser.get_value_mappings()
+        print(value_mappings)
 
         print("Done testing!")

+ 4 - 3
setup.py

@@ -1,17 +1,18 @@
 from setuptools import setup,find_packages
 
-INSTALL_REQUIRES = [    
+INSTALL_REQUIRES = [
         'pandas',
         'sqlalchemy',
         'sqlparse',
-        'pymysql',      
+        'pymysql',
         'pymongo',
-        'jsonref', 
+        'jsonref',
         'simplejson',
         'mysql',
         'sqlalchemy_utils',
         'sklearn',
         'hyperopt',
+        'boltons'
 ]