Browse Source

added a new method to CleaningUtils

tanja 5 years ago
parent
commit
f2a8a4e720
2 changed files with 48 additions and 2 deletions
  1. 1 1
      cdplib/db_handlers/SQLHandler.py
  2. 47 1
      cdplib/utils/CleaningUtils.py

+ 1 - 1
cdplib/db_handlers/SQLHandler.py

@@ -378,7 +378,7 @@ class SQLHandler:
 
 
     def get_column_names(self, tablename: str,
     def get_column_names(self, tablename: str,
                          schema: str = None,
                          schema: str = None,
-                         query: str = None):
+                         query: str = None) -> list:
         '''
         '''
         Tries to retrieve column information from database with given query.
         Tries to retrieve column information from database with given query.
         If this does not work, tries to select one row from the given table.
         If this does not work, tries to select one row from the given table.

+ 47 - 1
cdplib/utils/CleaningUtils.py

@@ -12,9 +12,12 @@ import numpy as np
 
 
 class CleaningUtils:
 class CleaningUtils:
     '''
     '''
+    Unites different methods for data cleaning
     '''
     '''
     def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
     def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
         '''
         '''
+        Converts values from string to date in a pandas Series
+         where possibly multiple date formats are mixed
         '''
         '''
         formats = list(formats)
         formats = list(formats)
 
 
@@ -37,8 +40,14 @@ class CleaningUtils:
 
 
         return converted
         return converted
 
 
-    def standarize_writing(self, s: str, to_lowercase: bool = True):
+    def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
         '''
         '''
+         Cleans a string: replaces German letters by ASCII substitutes
+          (e.g. "Ö" -> "Oe"), replaces all non-letter characters by underscore,
+          converts to lowercase (if to_lowercase is set).
+
+          Used for standardizing names, for example, before writing
+          to a database.
         '''
         '''
         import re
         import re
 
 
@@ -51,6 +60,7 @@ class CleaningUtils:
                                     "Ö": "Oe"}
                                     "Ö": "Oe"}
 
 
         s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
         s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
+
         for char, correct_char in german_character_mapping.items():
         for char, correct_char in german_character_mapping.items():
             s = s.replace(char, correct_char)
             s = s.replace(char, correct_char)
 
 
@@ -61,3 +71,39 @@ class CleaningUtils:
 
 
         return s
         return s
 
 
+    def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
+        '''
+        If a dataframe has multiple columns with the same name (up to a prefix
+         or a suffix), melts them together in one; returns df itself if there are none.
+         Note: the columns of the passed df are renamed in place.
+
+        :param suffix: regex pattern; every match is removed from the column names before comparing them
+        :param prefix: regex pattern; every match is removed from the column names before comparing them
+        '''
+        from collections import Counter
+
+        import re
+
+        # remove the suffix and the prefix from the column names (now the duplicates are truly duplicates)
+        df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
+
+        column_counter = Counter(df.columns)
+        # names occurring exactly once are passed to melt as identifier columns
+        id_vars = [c for c in column_counter if column_counter[c] == 1]
+        # names occurring more than once are the columns to melt
+        dup_vars = [c for c in column_counter if column_counter[c] > 1]
+
+        if len(dup_vars) == 0:
+            return df
+
+        else:
+            df_melted = []
+            # NOTE(review): value_vars=[dup_vars] nests the whole list of duplicated names; presumably [dup_var] was intended - TODO confirm
+            for dup_var in dup_vars:
+                dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_vars], value_name=dup_var)\
+                                   .set_index(id_vars)[dup_var]
+
+                df_melted.append(dup_var_melted)
+            # put the melted series side by side on the id_vars index, then turn that index back into columns
+            return pd.concat(df_melted, axis=1, sort=False).reset_index()
+