CleaningUtils.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Sep 27 16:20:03 2019
  5. @author: tanya
  6. """
  7. import pandas as pd
  8. import numpy as np
  9. class CleaningUtils:
  10. '''
  11. '''
  12. def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
  13. '''
  14. '''
  15. formats = list(formats)
  16. converted = pd.Series([pd.to_datetime(np.nan)]*len(series))
  17. for formt in formats:
  18. if formt == "%d%m%Y":
  19. missing_leading_zero = (series.astype(str).str.len() == 7)
  20. series = series.astype(str)
  21. series.loc[missing_leading_zero] = "0" +\
  22. series.loc[missing_leading_zero]
  23. converted_this_format = pd.to_datetime(series,
  24. format=formt,
  25. errors="coerce")
  26. converted.fillna(converted_this_format, inplace=True)
  27. return converted
  28. def standarize_writing(self, s: str):
  29. '''
  30. '''
  31. import re
  32. german_character_mapping = {"ß": "ss",
  33. "ü": "ue",
  34. "Ü": "Ue",
  35. "ä": "ae",
  36. "Ä": "Ae",
  37. "ö": "oe",
  38. "Ö": "Oe"}
  39. s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
  40. for char, correct_char in german_character_mapping.items():
  41. s = s.replace(char, correct_char)
  42. s = s.lower()
  43. s = re.sub('[^0-9a-zA-Z]+', '_', s)
  44. return s