
Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib into oskar-dev

ogert, 4 years ago
commit a9e8716ec6

+ 10 - 0
README.md

@@ -2,6 +2,16 @@
 
 Install cdplib via pipenv: `pipenv install -e git+https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git#egg=cdplib  `
 
+
+#### Install via Pipfile
+
+* To install the master branch, add the following line to the \[packages\] section of the Pipfile:
+cdplib = {editable = true, git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
+
+* To install any other branch, add instead:
+cdplib = {editable = true, ref = "BRANCH", git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
+
+
 ### Adding new Features/Folders/Packages
 How to set up a new Package: 
 

+ 1 - 1
cdplib/db_handlers/SQLHandler.py

@@ -382,7 +382,7 @@ class SQLHandler:
 
     def get_column_names(self, tablename: str,
                          schema: str = None,
-                         query: str = None):
+                         query: str = None) -> list:
         '''
         Tries to retrieve column information from database with given query.
         If this does not work, tries to select one row from the given table.
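
The new `-> list` annotation documents what the docstring already promises: column names come from the supplied query if possible, otherwise from a one-row probe of the table. A minimal standalone sketch of that fallback (using sqlite3 and pandas for illustration only; this is not the cdplib SQLHandler API):

```python
import sqlite3

import pandas as pd


def get_column_names(con, tablename: str, query: str = None) -> list:
    """Hypothetical sketch of the query-first, probe-second fallback."""
    probe = "SELECT * FROM {} LIMIT 1".format(tablename)
    try:
        # Prefer the caller-supplied query if one was given.
        return pd.read_sql(query if query is not None else probe, con=con).columns.tolist()
    except Exception:
        # Fall back to selecting a single row from the table.
        return pd.read_sql(probe, con=con).columns.tolist()


con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE components (id INTEGER, name TEXT)")
print(get_column_names(con, "components"))  # ['id', 'name']
```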

+ 28 - 36
cdplib/db_migration/DataFrameToCollection.py

@@ -71,16 +71,16 @@ class DataFrameToCollection():
         '''
 
         from copy import deepcopy
-        
+
         data = self._melt_duplicated_columns(data)
-        
+
         reshaped_fields = []
 
         if schema is None:
             schema = self.schema
 
         for field in schema["properties"]:
-            
+
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
@@ -90,10 +90,10 @@ class DataFrameToCollection():
             if field_type not in ["array", "object"]:
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
-                
+
                 # check that there is only one possible value of this field
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
-                
+
                 # n_distinct_valus can be 0 if the column only contains NaN values
                 if n_distinct_values > 1:
                     err = "Field {0} is not unique with respect to {1}"\
@@ -115,30 +115,30 @@ class DataFrameToCollection():
             elif field_type == "object":
 
                 sub_schema = deepcopy(schema["properties"][field])
-                
+
                 # rename sub-schema properties to match with data column names
                 sub_schema["properties"] =\
                     {".".join([field, k]): v for k, v
                      in sub_schema["properties"].items()}
-                
+
                 sub_data = self.to_list_of_documents(
                             data=data,
                             schema=sub_schema,
                             grp_fields=grp_fields,
                             _final_step=False)
-                
+
                 # Need to be checked since child elements can be empty
                 if sub_data is not None:
-                    
+
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field.name = field
-    
+
                     reshaped_fields.append(reshaped_field)
 
             # if field is a list of dictionaries
             elif field_type == "array":
-             
-               
+
+
                 items_type = schema["properties"][field]["items"]["bsonType"]
 
                 if items_type == "object":
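
The renaming step in this hunk prefixes nested schema keys with the parent field so they line up with flattened dataframe columns such as `address.city`. A toy illustration (field name and schema invented for the example):

```python
from copy import deepcopy

schema = {"properties": {"address": {"bsonType": "object",
                                     "properties": {"city": {"bsonType": "string"},
                                                    "zip": {"bsonType": "string"}}}}}
field = "address"
sub_schema = deepcopy(schema["properties"][field])

# Prefix nested property names with the parent field, as in the hunk above.
sub_schema["properties"] = {".".join([field, k]): v
                            for k, v in sub_schema["properties"].items()}

assert sorted(sub_schema["properties"]) == ["address.city", "address.zip"]
```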
@@ -149,12 +149,12 @@ class DataFrameToCollection():
                     sub_schema["properties"] =\
                         {".".join([field, k]): v for k, v in
                          sub_schema["properties"].items()}
-                    
+
                     # extend grp fields by sub-fields of field simple types
                     sub_grp_fields = [f for f in sub_schema["properties"]
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       and (f in data.columns)]
-                    
+
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
@@ -162,19 +162,19 @@ class DataFrameToCollection():
 
                         self._log.error(err)
                         raise Exception(err)
-                    
+
                     # group and reshape sub-fields with complex types
                     sub_data = self.to_list_of_documents(
                                 data=data,
                                 schema=sub_schema,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 _final_step=False)
-                    
+
                     if sub_data is not None:
-                        
+
                         # gether the results into a list of dictionaries
                         sub_data = sub_data.apply(self._make_dict, axis=1)
-                        
+
                         sub_data.name = field
                         sub_data = sub_data.reset_index(grp_fields)
                         ######################################################
@@ -184,8 +184,8 @@ class DataFrameToCollection():
                                     .apply(self._make_list_of_distinct)
                         ######################################################
                         reshaped_fields.append(reshaped_field)
-                        
-                    
+
+
                 # if field is a list of values with simple type
                 elif items_type == "array":
                     grp_fields = [c for c in grp_fields if c in data.columns]
@@ -208,9 +208,9 @@ class DataFrameToCollection():
                         reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
-            
+
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
-            
+
             if _final_step:
                 # dropping the index names if it is the final step,
                 # if not the index is needed for merging
@@ -219,7 +219,7 @@ class DataFrameToCollection():
                                    .reset_index(drop=False)
 
                 self._log.info("Done reshaping the dataframe to a list of documents")
-                
+
             return reshaped_fields
 
         else:
@@ -272,8 +272,8 @@ class DataFrameToCollection():
          entries are arbitrary objects
          (pandas unique() method does not work if entries are of complex types)
         '''
-            
-    
+
+
         if x.size == 1:
             uniques = x.tolist()
             '''
@@ -287,15 +287,15 @@ class DataFrameToCollection():
                        .assign(temp_str=lambda y: y["temp"].astype(np.str))\
                        .drop_duplicates(subset=["temp_str"])\
                        .drop("temp_str", axis=1).iloc[:, 0].tolist()
-        
-    
+
+
         def is_empty(y):
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             return is_empty_dict or is_empty_list
 
         return [el for el in uniques if not is_empty(el)]
-        
+
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
         '''
         return: list of unique values from a Series where
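
`_make_list_of_distinct` deduplicates Series entries via their string representation because `unique()` breaks on unhashable values such as lists and dicts. The same trick in isolation (using `astype(str)` here rather than the deprecated `np.str`):

```python
import pandas as pd

x = pd.Series([[1, 2], [1, 2], [3]])

# Stringify the entries, drop duplicates on the string form, keep the originals.
uniques = x.to_frame(name="temp")\
           .assign(temp_str=lambda y: y["temp"].astype(str))\
           .drop_duplicates(subset=["temp_str"])\
           .drop("temp_str", axis=1).iloc[:, 0].tolist()

assert uniques == [[1, 2], [3]]
```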
@@ -413,11 +413,3 @@ if __name__ == "__main__":
                     data=df,
                     schema=schm,
                     grp_fields=grp_fields)
-
-    
-    
-    
-    
-    
-    
-    

+ 10 - 9
cdplib/db_migration/MigrationCleaning.py

@@ -36,7 +36,7 @@ class MigrationCleaning:
                  schema_parser: type = ParseJsonSchema):
         '''
         '''
-        self._log = Log('Migration Cleaning')
+        self.log = Log('Migration Cleaning')
         self._exception_handler = ExceptionsHandler()
 
         assert isinstance(inconsist_report_table, str),\
@@ -141,7 +141,7 @@ class MigrationCleaning:
                     (target_field in target_types) and\
                     (target_types[target_field] != source_types[source_field]):
 
-                self.log_and_raise(("Type {0} of field {1} "
+                self.log.log_and_raise_error(("Type {0} of field {1} "
                                     "in schema does not match "
                                     "type {2} of field {3} in "
                                     "migration mapping")
@@ -246,7 +246,7 @@ class MigrationCleaning:
         del data_inconsist
         gc.collect()
 
-        self._log.warning(("Filtering: {0} ."
+        self.log.warning(("Filtering: {0} ."
                            "Filtered {1} rows "
                            "and {2} instances"
                            .format(reason, n_rows_filtered, n_instances_filtered)))
@@ -312,11 +312,11 @@ class MigrationCleaning:
 
             except Exception as e:
 
-                self._exception_handler.log_and_raise(("Failed to replace {0} values "
+                self.log.log_and_raise_error(("Failed to replace {0} values "
                                     "in {1}. Exit with error {2}"
                                     .format(default_str, column, e)))
 
-        self._log.info("Replaced {} values".format(default_str))
+        self.log.info("Replaced {} values".format(default_str))
 
         return data
 
@@ -349,7 +349,7 @@ class MigrationCleaning:
 
                 elif (python_type == int) and data[column].isnull().any():
 
-                    self.log_and_raise(("Column {} contains missing values "
+                    self.log.log_and_raise_error(("Column {} contains missing values "
                                         "and cannot be of integer type"
                                         .format(column)))
 
@@ -364,7 +364,7 @@ class MigrationCleaning:
 
                 if data[column].dtype != python_type:
 
-                    self._log.warning(("After conversion type in {0} "
+                    self.log.warning(("After conversion type in {0} "
                                        "should be {1} "
                                        "but is still {2}"
                                        .format(column,
@@ -373,11 +373,11 @@ class MigrationCleaning:
 
             except Exception as e:
 
-                self._exception_handler.log_and_raise(("Failed to convert types in {0}. "
+                self.log.log_and_raise_error(("Failed to convert types in {0}. "
                                     "Exit with error {1}"
                                     .format(column, e)))
 
-        self._log.info("Converted dtypes")
+        self.log.info("Converted dtypes")
 
         return data
 
@@ -559,3 +559,4 @@ if __name__ == "__main__":
         data = cleaner.filter_notallowed_values(data)
 
     print("Done!")
+    
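
The renames in this file route all messages through a single `self.log` object with `info`, `warning` and `log_and_raise_error` methods instead of the previous `self._log` / `self._exception_handler` pair. The cdplib `Log` class itself is not part of this diff; a minimal sketch of the interface these calls assume:

```python
import logging


class Log:
    """Hypothetical minimal logger; method names taken from the calls above."""

    def __init__(self, name: str):
        self._logger = logging.getLogger(name)

    def info(self, message: str):
        self._logger.info(message)

    def warning(self, message: str):
        self._logger.warning(message)

    def log_and_raise_error(self, message: str):
        # Record the error before raising so it also appears in the log.
        self._logger.error(message)
        raise Exception(message)
```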

+ 5 - 2
cdplib/db_migration/ParseJsonSchema.py

@@ -49,7 +49,9 @@ class ParseJsonSchema(ParseDbSchema):
         for schema_path in schema_paths:
             try:
                 with open(schema_path, "r") as f:
-                    self.schemas.append(json.load(f))
+                    schema = json.load(f)
+                # Load schema dereferenced and cleaned by default values
+                self.schemas.append(self._dereference_schema(schema))
 
             except Exception as e:
                 err = ("Could not load json schema, "
@@ -399,7 +401,7 @@
 
 if __name__ == "__main__":
 
-    # Only for testing
+#     Only for testing
 
     schema_path = os.path.join(".", "mongo_schema", "schema_components.json")
     
@@ -422,3 +424,4 @@ if __name__ == "__main__":
         allowed_values = parse_obj.get_allowed_values()
 
         descriptions = parse_obj.get_field_descriptions()
+    

+ 47 - 1
cdplib/utils/CleaningUtils.py

@@ -12,9 +12,12 @@ import numpy as np
 
 class CleaningUtils:
     '''
+    Unites different methods for data cleaning
     '''
     def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
         '''
+        Converts values from string to date in a pandas Series
+         where possibly multiple date formats are mixed
         '''
         formats = list(formats)
 
@@ -37,8 +40,14 @@
 
         return converted
 
-    def standarize_writing(self, s: str, to_lowercase: bool = True):
+    def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
         '''
+         Cleans a string: replaces German special characters (ä, ö, ü, ß)
+          by their ASCII transliterations (ae, oe, ue, ss), replaces all
+          non-letter characters by underscores, and converts to lowercase.
+
+          Used for standardizing names, for example, before writing
+          to a database.
         '''
         import re
 
@@ -51,6 +60,7 @@ class CleaningUtils:
                                     "Ö": "Oe"}
                                     "Ö": "Oe"}
 
 
         s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
         s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
+
         for char, correct_char in german_character_mapping.items():
         for char, correct_char in german_character_mapping.items():
             s = s.replace(char, correct_char)
             s = s.replace(char, correct_char)
 
 
@@ -61,3 +71,39 @@
 
         return s
 
+    def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
+        '''
+        If a dataframe has multiple columns with the same name
+         (up to a prefix or a suffix),
+         melts these columns together into one
+
+        :param suffix: string or regex up to which we consider names as duplicated
+        :param prefix: string or regex up to which we consider names as duplicated
+        '''
+        from collections import Counter
+
+        import re
+
+        # remove the suffix and the prefix from the column names (now the duplicates are truly duplicates)
+        df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
+
+        column_counter = Counter(df.columns)
+
+        id_vars = [c for c in column_counter if column_counter[c] == 1]
+
+        dup_vars = [c for c in column_counter if column_counter[c] > 1]
+
+        if len(dup_vars) == 0:
+            return df
+
+        else:
+            df_melted = []
+
+            for dup_var in dup_vars:
+                dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
+                                   .set_index(id_vars)[dup_var]
+
+                df_melted.append(dup_var_melted)
+
+            return pd.concat(df_melted, axis=1, sort=False).reset_index()
+
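
A small illustration of what the new `melt_duplicated_columns` is for, written with plain pandas on invented data (it strips the suffixes by hand and does not call the method itself):

```python
import pandas as pd

# Two measurement columns that are duplicates up to a suffix.
df = pd.DataFrame({"id": [1, 2],
                   "temp_x": [20.5, 21.0],
                   "temp_y": [20.7, 21.2]})

# Strip the suffixes so both columns share the name "temp", then melt them
# into one long-format column, which is what the method does per duplicate.
df.columns = [c.split("_")[0] for c in df.columns]
melted = pd.melt(df, id_vars=["id"], value_name="temperature")

assert melted["temperature"].tolist() == [20.5, 21.0, 20.7, 21.2]
```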

+ 1 - 1
setup.py

@@ -26,7 +26,7 @@ INSTALL_REQUIRES = [
         'xeger',
         'simplejson',
         'mysql',
-        'sqlalchemy-utils'
+        'sqlalchemy-utils',
 ]