#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 27 16:20:03 2019

@author: tanya
"""

import re
from typing import Iterable, Union

import pandas as pd
import numpy as np


# Transliteration of German special characters to their ASCII spelling.
_GERMAN_CHARACTER_TABLE = str.maketrans({"ß": "ss",
                                         "ü": "ue",
                                         "Ü": "Ue",
                                         "ä": "ae",
                                         "Ä": "Ae",
                                         "ö": "oe",
                                         "Ö": "Oe"})

# Any run of characters that are not ASCII letters or digits.
_NON_ALPHANUMERIC = re.compile('[^0-9a-zA-Z]+')


class CleaningUtils:
    '''
    Collection of helper routines for cleaning raw data:
    date parsing for pandas Series and normalisation of strings.
    '''

    @staticmethod
    def convert_dates(series: pd.Series,
                      formats: Union[str, Iterable[str]]) -> pd.Series:
        '''
        Parse a series of date-like values by trying several formats.

        The formats are tried in the given order; for every element the
        result of the first format that parses successfully is kept.
        Elements that match none of the formats are returned as NaT.

        For the format "%d%m%Y" values whose string representation has
        7 characters are assumed to have lost their leading zero
        (e.g. the integer 1012019) and are left-padded with "0"
        before parsing.

        :param series: values to convert
        :param formats: a single strftime format or an iterable of them
        :return: datetime series carrying the same index as the input
        '''
        # A bare string must be treated as ONE format; list("...") would
        # split it into single characters.
        if isinstance(formats, str):
            formats = [formats]
        else:
            formats = list(formats)

        # Build the accumulator on the input's index: fillna aligns on
        # index labels, so a default RangeIndex would mismatch any
        # series that is not indexed 0..n-1.
        converted = pd.Series(pd.NaT,
                              index=series.index,
                              dtype="datetime64[ns]")

        for formt in formats:
            if formt == "%d%m%Y":
                # astype returns a copy, the caller's series is untouched.
                # The string-typed, padded series is also what later
                # formats are applied to.
                series = series.astype(str)
                missing_leading_zero = (series.str.len() == 7)
                series.loc[missing_leading_zero] = "0" +\
                    series.loc[missing_leading_zero]

            converted_this_format = pd.to_datetime(series,
                                                   format=formt,
                                                   errors="coerce")

            # Only fill positions that no earlier format could parse.
            converted = converted.fillna(converted_this_format)

        return converted

    def standarize_writing(self, s: str) -> str:
        '''
        Normalise a string to a lower-case ASCII identifier.

        Steps: literal "\\uXXXX" escape sequences contained in the text
        are decoded, German umlauts and "ß" are transliterated
        (e.g. "ü" -> "ue"), the result is lower-cased and every run of
        characters other than ASCII letters and digits is replaced by
        a single underscore.

        :param s: string to normalise
        :return: normalised string
        '''
        try:
            # Round trip turns literal "\uXXXX" sequences into the
            # characters they denote.
            s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
        except UnicodeError:
            # A backslash-u that is not followed by four hex digits
            # (e.g. in "C:\users") is not an escape; keep the text as is.
            pass

        s = s.translate(_GERMAN_CHARACTER_TABLE)

        s = s.lower()

        s = _NON_ALPHANUMERIC.sub('_', s)

        return s