CleaningUtils.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Sep 27 16:20:03 2019
  5. @author: tanya
  6. """
  7. import pandas as pd
  8. import numpy as np
  9. from typing import Union, List
  10. class CleaningUtils:
  11. '''
  12. Unites different methods for data cleaning
  13. '''
  14. def convert_dates(self,
  15. series: pd.Series,
  16. formats: Union[str, List[str]]) -> pd.Series:
  17. '''
  18. Converts values from string to date in a pandas Series
  19. where possibly multiple date formats are mixed
  20. '''
  21. formats = list(formats)
  22. converted = pd.Series([pd.to_datetime(np.nan)]*len(series))
  23. for formt in formats:
  24. if formt == "%d%m%Y":
  25. missing_leading_zero = (series.astype(str).str.len() == 7)
  26. series = series.astype(str)
  27. series.loc[missing_leading_zero] += "0"
  28. converted_this_format = pd.to_datetime(series,
  29. format=formt,
  30. errors="coerce")
  31. converted.fillna(converted_this_format, inplace=True)
  32. return converted
  33. def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
  34. '''
  35. Cleans a string: replaces german letters by their utf-8 encoding,
  36. replaces all non-letter characters by underscore,
  37. converts to lowercase.
  38. Used for standarizing names, for example, before writing
  39. to a database.
  40. '''
  41. import re
  42. german_character_mapping = {"ß": "ss",
  43. "ü": "ue",
  44. "Ü": "Ue",
  45. "ä": "ae",
  46. "Ä": "Ae",
  47. "ö": "oe",
  48. "Ö": "Oe"}
  49. s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
  50. for char, correct_char in german_character_mapping.items():
  51. s = s.replace(char, correct_char)
  52. if to_lowercase:
  53. s = s.lower()
  54. s = re.sub('[^0-9a-zA-Z]+', '_', s).lstrip("_").rstrip("_")
  55. return s
  56. def melt_duplicated_columns(self, df: pd.DataFrame,
  57. suffix: str = "",
  58. prefix: str = "") -> pd.DataFrame:
  59. '''
  60. If a dataframe has multiple columns with the same name
  61. (up to a prefix or a suffix),
  62. melts the columns together in one
  63. :parame suffix: string or regex up
  64. to which we consider names as duplicated
  65. :parame prefix: string or rege
  66. up to which we consider names as duplicated
  67. '''
  68. from collections import Counter
  69. import re
  70. # remove the suffix and the prefix from the column names
  71. # (now the duplicates are truely duplicates)
  72. df.columns = [re.sub(re.compile(prefix), "",
  73. re.sub(re.compile(suffix), "", c))
  74. for c in df.columns]
  75. column_counter = Counter(df.columns)
  76. id_vars = [c for c in column_counter if column_counter[c] == 1]
  77. dup_vars = [c for c in column_counter if column_counter[c] > 1]
  78. if len(dup_vars) == 0:
  79. return df
  80. else:
  81. df_melted = []
  82. for dup_var in dup_vars:
  83. dup_var_melted = pd.melt(frame=df,
  84. id_vars=id_vars,
  85. value_vars=[dup_var],
  86. value_name=dup_var)\
  87. .set_index(id_vars)[dup_var]
  88. df_melted.append(dup_var_melted)
  89. return pd.concat(df_melted, axis=1, sort=False).reset_index()