CleaningUtils.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Sep 27 16:20:03 2019
  5. @author: tanya
  6. """
  7. import pandas as pd
  8. import numpy as np
  9. class CleaningUtils:
  10. '''
  11. Unites different methods for data cleaning
  12. '''
  13. def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
  14. '''
  15. Converts values from string to date in a pandas Series
  16. where possibly multiple date formats are mixed
  17. '''
  18. formats = list(formats)
  19. converted = pd.Series([pd.to_datetime(np.nan)]*len(series))
  20. for formt in formats:
  21. if formt == "%d%m%Y":
  22. missing_leading_zero = (series.astype(str).str.len() == 7)
  23. series = series.astype(str)
  24. series.loc[missing_leading_zero] = "0" +\
  25. series.loc[missing_leading_zero]
  26. converted_this_format = pd.to_datetime(series,
  27. format=formt,
  28. errors="coerce")
  29. converted.fillna(converted_this_format, inplace=True)
  30. return converted
  31. def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
  32. '''
  33. Cleans a string: replaces german letters by their utf-8 encoding,
  34. replaces all non-letter characters by underscore,
  35. converts to lowercase.
  36. Used for standarizing names, for example, before writing
  37. to a database.
  38. '''
  39. import re
  40. german_character_mapping = {"ß": "ss",
  41. "ü": "ue",
  42. "Ü": "Ue",
  43. "ä": "ae",
  44. "Ä": "Ae",
  45. "ö": "oe",
  46. "Ö": "Oe"}
  47. s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
  48. for char, correct_char in german_character_mapping.items():
  49. s = s.replace(char, correct_char)
  50. if to_lowercase:
  51. s = s.lower()
  52. s = re.sub('[^0-9a-zA-Z]+', '_', s).lstrip("_").rstrip("_")
  53. return s
  54. def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
  55. '''
  56. If a dataframe has multiple columns with the same name
  57. (up to a prefix or a suffix),
  58. melts the columns together in one
  59. :parame suffix: string or regex up to which we consider names as duplicated
  60. :parame prefix: string or regex up to which we consider names as duplicated
  61. '''
  62. from collections import Counter
  63. import re
  64. # remove the suffix and the prefix from the column names (now the duplicates are truely duplicates)
  65. df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
  66. column_counter = Counter(df.columns)
  67. id_vars = [c for c in column_counter if column_counter[c] == 1]
  68. dup_vars = [c for c in column_counter if column_counter[c] > 1]
  69. if len(dup_vars) == 0:
  70. return df
  71. else:
  72. df_melted = []
  73. for dup_var in dup_vars:
  74. dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
  75. .set_index(id_vars)[dup_var]
  76. df_melted.append(dup_var_melted)
  77. return pd.concat(df_melted, axis=1, sort=False).reset_index()