|
@@ -12,9 +12,12 @@ import numpy as np
|
|
|
|
|
|
class CleaningUtils:
|
|
|
'''
|
|
|
+ Unites different methods for data cleaning
|
|
|
'''
|
|
|
def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
|
|
|
'''
|
|
|
+ Converts values from string to date in a pandas Series
|
|
|
+ where possibly multiple date formats are mixed
|
|
|
'''
|
|
|
formats = list(formats)
|
|
|
|
|
@@ -37,8 +40,14 @@ class CleaningUtils:
|
|
|
|
|
|
return converted
|
|
|
|
|
|
- def standarize_writing(self, s: str, to_lowercase: bool = True):
|
|
|
+ def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
|
|
|
'''
|
|
|
+ Cleans a string: replaces german letters by their utf-8 encoding,
|
|
|
+ replaces all non-letter characters by underscore,
|
|
|
+ converts to lowercase.
|
|
|
+
|
|
|
+ Used for standarizing names, for example, before writing
|
|
|
+ to a database.
|
|
|
'''
|
|
|
import re
|
|
|
|
|
@@ -51,6 +60,7 @@ class CleaningUtils:
|
|
|
"Ö": "Oe"}
|
|
|
|
|
|
s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
|
|
|
+
|
|
|
for char, correct_char in german_character_mapping.items():
|
|
|
s = s.replace(char, correct_char)
|
|
|
|
|
@@ -61,3 +71,39 @@ class CleaningUtils:
|
|
|
|
|
|
return s
|
|
|
|
|
|
+ def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
|
|
|
+ '''
|
|
|
+ If a dataframe has multiple columns with the same name
|
|
|
+ (up to a prefix or a suffix),
|
|
|
+ melts the columns together in one
|
|
|
+
|
|
|
+ :parame suffix: string or regex up to which we consider names as duplicated
|
|
|
+ :parame prefix: string or regex up to which we consider names as duplicated
|
|
|
+ '''
|
|
|
+ from collections import Counter
|
|
|
+
|
|
|
+ import re
|
|
|
+
|
|
|
+ # remove the suffix and the prefix from the column names (now the duplicates are truely duplicates)
|
|
|
+ df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
|
|
|
+
|
|
|
+ column_counter = Counter(df.columns)
|
|
|
+
|
|
|
+ id_vars = [c for c in column_counter if column_counter[c] == 1]
|
|
|
+
|
|
|
+ dup_vars = [c for c in column_counter if column_counter[c] > 1]
|
|
|
+
|
|
|
+ if len(dup_vars) == 0:
|
|
|
+ return df
|
|
|
+
|
|
|
+ else:
|
|
|
+ df_melted = []
|
|
|
+
|
|
|
+ for dup_var in dup_vars:
|
|
|
+ dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_vars], value_name=dup_var)\
|
|
|
+ .set_index(id_vars)[dup_var]
|
|
|
+
|
|
|
+ df_melted.append(dup_var_melted)
|
|
|
+
|
|
|
+ return pd.concat(df_melted, axis=1, sort=False).reset_index()
|
|
|
+
|