123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Fri Sep 27 16:20:03 2019
- @author: tanya
- """
- import pandas as pd
- import numpy as np
- class CleaningUtils:
- '''
- Unites different methods for data cleaning
- '''
- def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
- '''
- Converts values from string to date in a pandas Series
- where possibly multiple date formats are mixed
- '''
- formats = list(formats)
- converted = pd.Series([pd.to_datetime(np.nan)]*len(series))
- for formt in formats:
- if formt == "%d%m%Y":
- missing_leading_zero = (series.astype(str).str.len() == 7)
- series = series.astype(str)
- series.loc[missing_leading_zero] = "0" +\
- series.loc[missing_leading_zero]
- converted_this_format = pd.to_datetime(series,
- format=formt,
- errors="coerce")
- converted.fillna(converted_this_format, inplace=True)
- return converted
- def standarize_writing(self, s: str, to_lowercase: bool = True) -> str:
- '''
- Cleans a string: replaces german letters by their utf-8 encoding,
- replaces all non-letter characters by underscore,
- converts to lowercase.
- Used for standarizing names, for example, before writing
- to a database.
- '''
- import re
- german_character_mapping = {"ß": "ss",
- "ü": "ue",
- "Ü": "Ue",
- "ä": "ae",
- "Ä": "Ae",
- "ö": "oe",
- "Ö": "Oe"}
- s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
- for char, correct_char in german_character_mapping.items():
- s = s.replace(char, correct_char)
- if to_lowercase:
- s = s.lower()
- s = re.sub('[^0-9a-zA-Z]+', '_', s).lstrip("_").rstrip("_")
- return s
- def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
- '''
- If a dataframe has multiple columns with the same name
- (up to a prefix or a suffix),
- melts the columns together in one
- :parame suffix: string or regex up to which we consider names as duplicated
- :parame prefix: string or regex up to which we consider names as duplicated
- '''
- from collections import Counter
- import re
- # remove the suffix and the prefix from the column names (now the duplicates are truely duplicates)
- df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
- column_counter = Counter(df.columns)
- id_vars = [c for c in column_counter if column_counter[c] == 1]
- dup_vars = [c for c in column_counter if column_counter[c] > 1]
- if len(dup_vars) == 0:
- return df
- else:
- df_melted = []
- for dup_var in dup_vars:
- dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
- .set_index(id_vars)[dup_var]
- df_melted.append(dup_var_melted)
- return pd.concat(df_melted, axis=1, sort=False).reset_index()
|