|
@@ -0,0 +1,62 @@
|
|
|
|
+#!/usr/bin/env python3
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
+"""
|
|
|
|
+Created on Fri Sep 27 16:20:03 2019
|
|
|
|
+
|
|
|
|
+@author: tanya
|
|
|
|
+"""
|
|
|
|
+
|
|
|
|
+import pandas as pd
|
|
|
|
+import numpy as np
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class CleaningUtils:
    """Collection of helpers for cleaning raw data.

    Provides date parsing against several candidate formats and
    normalisation of free-text labels to lowercase ASCII identifiers.
    """

    # Transliterations applied by ``standarize_writing`` before lowercasing.
    _GERMAN_CHARACTER_MAPPING = {
        "ß": "ss",
        "ü": "ue",
        "Ü": "Ue",
        "ä": "ae",
        "Ä": "Ae",
        "ö": "oe",
        "Ö": "Oe",
    }

    @staticmethod
    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
        """Parse ``series`` into datetimes, trying each format in turn.

        Each element takes the value from the first format in ``formats``
        that parses it; elements that match no format end up as ``NaT``.

        For the format ``"%d%m%Y"`` values whose string form has exactly
        seven characters are assumed to have lost their leading zero
        (e.g. ``1012020``) and are padded with ``"0"`` before parsing.

        :param series: values to convert
        :param formats: a single strptime format string or an iterable
            of such strings, in order of priority
        :return: datetime series carrying the same index as ``series``
        """
        # A bare string must become a one-element list; ``list("%d%m%Y")``
        # would split it into single characters.
        if isinstance(formats, str):
            formats = [formats]
        else:
            formats = list(formats)

        # Share the input's index so that ``fillna`` aligns row by row and
        # the caller gets its own labels back.
        converted = pd.Series(pd.NaT,
                              index=series.index,
                              dtype="datetime64[ns]")

        for formt in formats:
            candidate = series

            if formt == "%d%m%Y":
                # Work on a local copy so the padding neither mutates the
                # caller's data nor leaks into the other formats.
                candidate = series.astype(str)
                missing_leading_zero = (candidate.str.len() == 7)
                candidate = candidate.mask(missing_leading_zero,
                                           "0" + candidate)

            converted_this_format = pd.to_datetime(candidate,
                                                   format=formt,
                                                   errors="coerce")

            # Only fill positions that no earlier format could parse.
            converted = converted.fillna(converted_this_format)

        return converted

    def standarize_writing(self, s: str):
        """Normalise ``s`` to a lowercase ASCII identifier.

        German umlauts and sharp s are transliterated (e.g. "ae", "ss"),
        the text is lowercased, and every run of characters other than
        ASCII letters and digits is replaced by a single underscore.

        :param s: text to normalise
        :return: the normalised string
        """
        import re

        # Round-trip through raw_unicode_escape: literal \uXXXX sequences
        # contained in the text are turned into the characters they denote.
        s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')

        for char, correct_char in self._GERMAN_CHARACTER_MAPPING.items():
            s = s.replace(char, correct_char)

        s = s.lower()

        s = re.sub('[^0-9a-zA-Z]+', '_', s)

        return s
|
|
|
|
+
|