
Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib into oskar-dev

ogert committed 4 years ago
commit a9e8716ec6

+ 10 - 0
README.md

@@ -2,6 +2,16 @@
 
 Install cdplib via pipenv: `pipenv install -e git+https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git#egg=cdplib  `
 
+
+#### Install via Pipfile
+
+* To install the master branch, add the following line to the `[packages]` section of the Pipfile:
+`cdplib = {editable = true, git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}`
+
+* To install any other branch, point the `ref` key at that branch:
+`cdplib = {editable = true, ref = "BRANCH", git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}`
+
+
 ### Adding new Features/Folders/Packages
 How to set up a new Package: 
 

+ 1 - 1
cdplib/db_handlers/SQLHandler.py

@@ -382,7 +382,7 @@ class SQLHandler:
 
     def get_column_names(self, tablename: str,
                          schema: str = None,
-                         query: str = None):
+                         query: str = None) -> list:
         '''
         Tries to retrieve column information from database with given query.
         If this does not work, tries to select one row from the given table.
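The hunk above only adds the `-> list` return annotation; the docstring describes a query-first, table-fallback lookup. Below is a minimal sketch of that behaviour, assuming an SQLAlchemy engine and a MySQL-style `LIMIT 1` fallback; the helper name and connection handling are illustrative, not the actual SQLHandler internals.

```python
import pandas as pd
from sqlalchemy import create_engine

def column_names(connection_string: str, tablename: str,
                 query: str = None) -> list:
    # illustrative helper, not the real SQLHandler.get_column_names
    engine = create_engine(connection_string)
    if query is not None:
        try:
            # first try to read the columns from the supplied query
            return pd.read_sql(query, con=engine).columns.tolist()
        except Exception:
            pass
    # fall back to selecting a single row from the table
    return pd.read_sql("SELECT * FROM {} LIMIT 1".format(tablename),
                       con=engine).columns.tolist()
```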

+ 28 - 36
cdplib/db_migration/DataFrameToCollection.py

@@ -71,16 +71,16 @@ class DataFrameToCollection():
         '''
 
         from copy import deepcopy
-        
+
         data = self._melt_duplicated_columns(data)
-        
+
         reshaped_fields = []
 
         if schema is None:
             schema = self.schema
 
         for field in schema["properties"]:
-            
+
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
@@ -90,10 +90,10 @@ class DataFrameToCollection():
             if field_type not in ["array", "object"]:
 
                 grp_fields = [c for c in grp_fields if c in data.columns]
-                
+
                 # check that there is only one possible value of this field
                 n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
-                
+
                 # n_distinct_values can be 0 if the column only contains NaN values
                 if n_distinct_values > 1:
                     err = "Field {0} is not unique with respect to {1}"\
@@ -115,30 +115,30 @@ class DataFrameToCollection():
             elif field_type == "object":
 
                 sub_schema = deepcopy(schema["properties"][field])
-                
+
                 # rename sub-schema properties to match with data column names
                 sub_schema["properties"] =\
                     {".".join([field, k]): v for k, v
                      in sub_schema["properties"].items()}
-                
+
                 sub_data = self.to_list_of_documents(
                             data=data,
                             schema=sub_schema,
                             grp_fields=grp_fields,
                             _final_step=False)
-                
+
                 # Needs to be checked since child elements can be empty
                 if sub_data is not None:
-                    
+
                     reshaped_field = sub_data.apply(self._make_dict, axis=1)
                     reshaped_field.name = field
-    
+
                     reshaped_fields.append(reshaped_field)
 
             # if field is a list of dictionaries
             elif field_type == "array":
-             
-               
+
+
                 items_type = schema["properties"][field]["items"]["bsonType"]
 
                 if items_type == "object":
@@ -149,12 +149,12 @@ class DataFrameToCollection():
                     sub_schema["properties"] =\
                         {".".join([field, k]): v for k, v in
                          sub_schema["properties"].items()}
-                    
+
                     # extend grp fields by sub-fields of field simple types
                     sub_grp_fields = [f for f in sub_schema["properties"]
                                       if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
                                       and (f in data.columns)]
-                    
+
                     if len(sub_grp_fields) == 0:
                         err = ("One of the sub-keys in a list of documents"
                                " must be of simple type for the field {}"
@@ -162,19 +162,19 @@ class DataFrameToCollection():
 
                         self._log.error(err)
                         raise Exception(err)
-                    
+
                     # group and reshape sub-fields with complex types
                     sub_data = self.to_list_of_documents(
                                 data=data,
                                 schema=sub_schema,
                                 grp_fields=grp_fields + sub_grp_fields,
                                 _final_step=False)
-                    
+
                     if sub_data is not None:
-                        
+
                         # gather the results into a list of dictionaries
                         sub_data = sub_data.apply(self._make_dict, axis=1)
-                        
+
                         sub_data.name = field
                         sub_data = sub_data.reset_index(grp_fields)
                         ######################################################
@@ -184,8 +184,8 @@ class DataFrameToCollection():
                                     .apply(self._make_list_of_distinct)
                         ######################################################
                         reshaped_fields.append(reshaped_field)
-                        
-                    
+
+
                 # if field is a list of values with simple type
                 elif items_type == "array":
                     grp_fields = [c for c in grp_fields if c in data.columns]
@@ -208,9 +208,9 @@ class DataFrameToCollection():
                         reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
-            
+
             reshaped_fields = pd.concat(reshaped_fields, sort=False, axis=1)
-            
+
             if _final_step:
                 # dropping the index names if it is the final step,
                 # if not the index is needed for merging
@@ -219,7 +219,7 @@ class DataFrameToCollection():
                                    .reset_index(drop=False)
 
                 self._log.info("Done reshaping the dataframe to a list of documents")
-                
+
             return reshaped_fields
 
         else:
@@ -272,8 +272,8 @@ class DataFrameToCollection():
          entries are arbitrary objects
          (pandas unique() method does not work if entries are of complex types)
         '''
-            
-    
+
+
         if x.size == 1:
             uniques = x.tolist()
             '''
@@ -287,15 +287,15 @@ class DataFrameToCollection():
                         .assign(temp_str=lambda y: y["temp"].astype(np.str))\
                         .drop_duplicates(subset=["temp_str"])\
                         .drop("temp_str", axis=1).iloc[:, 0].tolist()
-        
-    
+
+
         def is_empty(y):
             is_empty_dict = (isinstance(y, dict) and (len(y) == 0))
             is_empty_list = (isinstance(y, list) and (len(y) == 0))
             return is_empty_dict or is_empty_list
 
         return [el for el in uniques if not is_empty(el)]
-        
+
     def _make_flattened_list_of_distinct(self, x: pd.Series) -> list:
         '''
         return: list of unique values from a Series where
@@ -413,11 +413,3 @@ if __name__ == "__main__":
                     data=df,
                     schema=schm,
                     grp_fields=grp_fields)
-
-    
-    
-    
-    
-    
-    
-    
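Most of the hunks above are whitespace cleanup; for orientation, here is a self-contained sketch of the reshaping idea the class implements: rows that share the group fields collapse into one document, dot-named columns become nested objects, and repeated values become arrays. The frame and column names are made up, and plain pandas is used instead of the class itself.

```python
import pandas as pd

df = pd.DataFrame({"order_id": [1, 1, 2],
                   "customer.name": ["Anna", "Anna", "Ben"],
                   "item": ["bolt", "nut", "bolt"]})

def to_document(group: pd.DataFrame) -> dict:
    first = group.iloc[0]
    return {"order_id": int(first["order_id"]),
            "customer": {"name": first["customer.name"]},   # nested object
            "item": group["item"].unique().tolist()}        # array field

documents = [to_document(g) for _, g in df.groupby("order_id", sort=False)]
# [{'order_id': 1, 'customer': {'name': 'Anna'}, 'item': ['bolt', 'nut']},
#  {'order_id': 2, 'customer': {'name': 'Ben'}, 'item': ['bolt']}]
```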

+ 10 - 9
cdplib/db_migration/MigrationCleaning.py

@@ -36,7 +36,7 @@ class MigrationCleaning:
                  schema_parser: type = ParseJsonSchema):
         '''
         '''
-        self._log = Log('Migration Cleaning')
+        self.log = Log('Migration Cleaning')
         self._exception_handler = ExceptionsHandler()
 
         assert isinstance(inconsist_report_table, str),\
@@ -141,7 +141,7 @@ class MigrationCleaning:
                     (target_field in target_types) and\
                     (target_types[target_field] != source_types[source_field]):
 
-                self.log_and_raise(("Type {0} of field {1} "
+                self.log.log_and_raise_error(("Type {0} of field {1} "
                                     "in schema does not match "
                                     "type {2} of field {3} in "
                                     "migration mapping")
@@ -246,7 +246,7 @@ class MigrationCleaning:
         del data_inconsist
         gc.collect()
 
-        self._log.warning(("Filtering: {0} ."
+        self.log.warning(("Filtering: {0} ."
                            "Filtered {1} rows "
                            "and {2} instances"
                            .format(reason, n_rows_filtered, n_instances_filtered)))
@@ -312,11 +312,11 @@ class MigrationCleaning:
 
             except Exception as e:
 
-                self._exception_handler.log_and_raise(("Failed to replace {0} values "
+                self.log.log_and_raise_error(("Failed to replace {0} values "
                                     "in {1}. Exit with error {2}"
                                     .format(default_str, column, e)))
 
-        self._log.info("Replaced {} values".format(default_str))
+        self.log.info("Replaced {} values".format(default_str))
 
         return data
 
@@ -349,7 +349,7 @@ class MigrationCleaning:
 
                 elif (python_type == int) and data[column].isnull().any():
 
-                    self.log_and_raise(("Column {} contains missing values "
+                    self.log.log_and_raise_error(("Column {} contains missing values "
                                         "and cannot be of integer type"
                                         .format(column)))
 
@@ -364,7 +364,7 @@ class MigrationCleaning:
 
                 if data[column].dtype != python_type:
 
-                    self._log.warning(("After conversion type in {0} "
+                    self.log.warning(("After conversion type in {0} "
                                        "should be {1} "
                                        "but is still {2}"
                                        .format(column,
@@ -373,11 +373,11 @@ class MigrationCleaning:
 
             except Exception as e:
 
-                self._exception_handler.log_and_raise(("Failed to convert types in {0}. "
+                self.log.log_and_raise_error(("Failed to convert types in {0}. "
                                     "Exit with error {1}"
                                     .format(column, e)))
 
-        self._log.info("Converted dtypes")
+        self.log.info("Converted dtypes")
 
         return data
 
@@ -559,3 +559,4 @@ if __name__ == "__main__":
         data = cleaner.filter_notallowed_values(data)
 
     print("Done!")
+    

+ 5 - 2
cdplib/db_migration/ParseJsonSchema.py

@@ -49,7 +49,9 @@ class ParseJsonSchema(ParseDbSchema):
         for schema_path in schema_paths:
             try:
                 with open(schema_path, "r") as f:
-                    self.schemas.append(json.load(f))
+                    schema = json.load(f)
+                # Load the schema dereferenced and cleaned of default values
+                self.schemas.append(self._dereference_schema(schema))
 
             except Exception as e:
                 err = ("Could not load json schema, "
@@ -399,7 +401,7 @@ class ParseJsonSchema(ParseDbSchema):
 
 if __name__ == "__main__":
 
-    # Only for testing
+#     Only for testing
 
     schema_path = os.path.join(".", "mongo_schema", "schema_components.json")
     
@@ -422,3 +424,4 @@ if __name__ == "__main__":
         allowed_values = parse_obj.get_allowed_values()
 
         descriptions = parse_obj.get_field_descriptions()
+    

+ 47 - 1
cdplib/utils/CleaningUtils.py

@@ -12,9 +12,12 @@ import numpy as np
 
 class CleaningUtils:
     '''
+    Collects helper methods for data cleaning
     '''
     def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
         '''
+        Converts string values in a pandas Series to dates,
+         where several date formats may be mixed
         '''
         formats = list(formats)
 
@@ -37,8 +40,14 @@ class CleaningUtils:
 
         return converted
 
-    def standarize_writing(self, s: str, to_lowercase: bool = True):
+    def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
         '''
+         Cleans a string: replaces German special characters by ASCII
+          transliterations (e.g. "Ö" -> "Oe"), replaces all non-letter
+          characters by underscores, and converts to lowercase
+          (unless to_lowercase is False).
+
+          Used for standardizing names, for example before writing them
+          to a database.
         '''
         import re
 
@@ -51,6 +60,7 @@ class CleaningUtils:
                                     "Ö": "Oe"}
 
         s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
+
         for char, correct_char in german_character_mapping.items():
             s = s.replace(char, correct_char)
 
@@ -61,3 +71,39 @@ class CleaningUtils:
 
         return s
 
+    def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
+        '''
+        If a dataframe has multiple columns with the same name
+         (up to a prefix or a suffix),
+         melts those columns together into one.
+
+        :param suffix: string or regex up to which we consider names as duplicated
+        :param prefix: string or regex up to which we consider names as duplicated
+        '''
+        from collections import Counter
+
+        import re
+
+        # remove the suffix and the prefix from the column names (now the duplicates are truly duplicates)
+        df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
+
+        column_counter = Counter(df.columns)
+
+        id_vars = [c for c in column_counter if column_counter[c] == 1]
+
+        dup_vars = [c for c in column_counter if column_counter[c] > 1]
+
+        if len(dup_vars) == 0:
+            return df
+
+        else:
+            df_melted = []
+
+            for dup_var in dup_vars:
+                dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
+                                   .set_index(id_vars)[dup_var]
+
+                df_melted.append(dup_var_melted)
+
+            return pd.concat(df_melted, axis=1, sort=False).reset_index()
+
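A self-contained sketch of the idea behind `melt_duplicated_columns`: columns whose names differ only by a suffix are melted into a single column, one row per original (id, value) pair. The frame and the `_<number>` suffix are made-up examples, and plain `pd.melt` is used here instead of the method itself.

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2],
                   "tag_1": ["a", "c"],
                   "tag_2": ["b", "d"]})

melted = pd.melt(df, id_vars=["id"], value_vars=["tag_1", "tag_2"],
                 value_name="tag").drop("variable", axis=1)
#    id tag
# 0   1   a
# 1   2   c
# 2   1   b
# 3   2   d
```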

+ 1 - 1
setup.py

@@ -26,7 +26,7 @@ INSTALL_REQUIRES = [
         'xeger',
         'simplejson',
         'mysql',
-        'sqlalchemy-utils'
+        'sqlalchemy-utils',
 ]