#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Fri Sep 27 16:20:03 2019 @author: tanya """ import pandas as pd import numpy as np from typing import Union, List class CleaningUtils: ''' Unites different methods for data cleaning ''' def convert_dates(self, series: pd.Series, formats: Union[str, List[str]]) -> pd.Series: ''' Converts values from string to date in a pandas Series where possibly multiple date formats are mixed ''' formats = list(formats) converted = pd.Series([pd.to_datetime(np.nan)]*len(series)) for formt in formats: if formt == "%d%m%Y": missing_leading_zero = (series.astype(str).str.len() == 7) series = series.astype(str) series.loc[missing_leading_zero] += "0" converted_this_format = pd.to_datetime(series, format=formt, errors="coerce") converted.fillna(converted_this_format, inplace=True) return converted def standarize_writing(self, s: str, to_lowercase: bool = True) -> str: ''' Cleans a string: replaces german letters by their utf-8 encoding, replaces all non-letter characters by underscore, converts to lowercase. Used for standarizing names, for example, before writing to a database. ''' import re german_character_mapping = {"ß": "ss", "ü": "ue", "Ü": "Ue", "ä": "ae", "Ä": "Ae", "ö": "oe", "Ö": "Oe"} s = s.encode('raw_unicode_escape').decode('raw_unicode_escape') for char, correct_char in german_character_mapping.items(): s = s.replace(char, correct_char) if to_lowercase: s = s.lower() s = re.sub('[^0-9a-zA-Z]+', '_', s).lstrip("_").rstrip("_") return s def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame: ''' If a dataframe has multiple columns with the same name (up to a prefix or a suffix), melts the columns together in one :parame suffix: string or regex up to which we consider names as duplicated :parame prefix: string or rege up to which we consider names as duplicated ''' from collections import Counter import re # remove the suffix and the prefix from the column names # (now the duplicates are truely duplicates) df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns] column_counter = Counter(df.columns) id_vars = [c for c in column_counter if column_counter[c] == 1] dup_vars = [c for c in column_counter if column_counter[c] > 1] if len(dup_vars) == 0: return df else: df_melted = [] for dup_var in dup_vars: dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\ .set_index(id_vars)[dup_var] df_melted.append(dup_var_melted) return pd.concat(df_melted, axis=1, sort=False).reset_index()