DataCleaningUtils.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Sep 27 16:20:03 2019
  5. @author: tanya
  6. """
  7. import pandas as pd
  8. import numpy as np
  9. class CleaningUtils:
  10. '''
  11. '''
  def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
      """
      Parse a series of date-like values by trying several formats in turn.

      Each format is applied to the whole series with ``errors="coerce"``;
      a position keeps the result of the first format that parsed it.
      Positions that no format can parse are left as ``NaT``.

      For the format ``"%d%m%Y"`` the values are first converted to text and
      any 7-character value gets a leading ``"0"``, to restore a zero that
      was dropped from the day part. The text form is then also the input
      for the formats that follow.

      NOTE: this function is declared without ``self``, so it has to be
      called through the class (``CleaningUtils.convert_dates(...)``).

      :param series: values to parse; the caller's object is not modified
      :param formats: one ``strftime`` format string, or an iterable of them
      :return: datetime series carrying the same index as ``series``
      """
      # A bare string is ONE format; list("...") would split it into chars.
      if isinstance(formats, str):
          formats = [formats]
      else:
          formats = list(formats)

      # Build the accumulator on the input's index: fillna aligns on labels,
      # so a default RangeIndex would silently match nothing for any other
      # index. The explicit dtype keeps empty input a datetime series too.
      converted = pd.Series(pd.NaT,
                            index=series.index,
                            dtype="datetime64[ns]")

      for formt in formats:
          if formt == "%d%m%Y":
              # astype returns a new object, the caller's series is untouched
              series = series.astype(str)
              missing_leading_zero = series.str.len() == 7
              series = series.mask(missing_leading_zero, "0" + series)

          converted_this_format = pd.to_datetime(series,
                                                 format=formt,
                                                 errors="coerce")

          # only fill the positions that earlier formats could not parse
          converted = converted.fillna(converted_this_format)

      return converted
  def standarize_writing(self, s: str, to_lowercase: bool = True):
      """
      Turn a free-text label into an ASCII identifier-like string.

      Steps, in order:
      1. round-trip the text through the ``raw_unicode_escape`` codec
         (this decodes literal backslash-u escape sequences in the text);
      2. transliterate German umlauts and sharp s (e.g. "ü" -> "ue");
      3. optionally lowercase the result;
      4. collapse every run of characters other than ASCII letters and
         digits into a single "_" and drop underscores at both ends.

      :param s: text to normalise
      :param to_lowercase: lowercase the text when True (the default)
      :return: the normalised string
      """
      import re

      # The replacement strings contain none of the mapped characters, so a
      # single translate pass gives the same result as replacing one by one.
      umlaut_table = str.maketrans({"ß": "ss",
                                    "ü": "ue",
                                    "Ü": "Ue",
                                    "ä": "ae",
                                    "Ä": "Ae",
                                    "ö": "oe",
                                    "Ö": "Oe"})

      raw_bytes = s.encode('raw_unicode_escape')
      text = raw_bytes.decode('raw_unicode_escape')
      text = text.translate(umlaut_table)

      if to_lowercase:
          text = text.lower()

      with_underscores = re.sub('[^0-9a-zA-Z]+', '_', text)
      return with_underscores.strip("_")