#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon Sep 30 10:16:23 2019 @author: tanya """ import pandas as pd import numpy as np import os import sys sys.path.append(os.getcwd()) from libraries.import_process_instances.CleanRs1 import CleanRs1 class MergeProcessTables: ''' ''' def merge_rs2(self, data: pd.DataFrame, rs2: pd.DataFrame) -> pd.DataFrame: ''' Difficulty: rows that correspond to one radsatznummer and one station in rs1 and rs2 are in many-to-many relation and the ende_der_bearbeitung for these rows often does not match in the two tables. Rules: A) We check if the end_der_bearbeitung of an activity from rs2 is >= begin_der_bearbeitung and <= of ende_der_bearbeitung of an entry in rs1 B) If an activity (row in rs2) has ende_der_bearbeitung which is later than ende_der_bearbeitung of all the entries in rs1, we check if it ended earlier than the begin_der_bearbeitung on the next station. If it is so, we assign the activity to the latest entry in rs1 for this station. Same logic applies for merging the table rs70. ''' data = data.copy(deep=True) rs2 = rs2.copy(deep=True) station_change = (data["positionsnummer"] != data["positionsnummer"].shift(-1)) data["order"] = data.index common_columns = ["radsatznummer", "positionsnummer", "positionsname"] data = pd.merge(data, rs2, how="left", on=common_columns) start_time_next_station =\ CleanRs1()._get_next_station_start_time(data)\ .fillna(data["ende_der_bearbeitung_x"]) start_matches = (data["ende_der_bearbeitung_y"] >= data["begin_der_bearbeitung"]) end_matches = ((data["ende_der_bearbeitung_y"] <= data["ende_der_bearbeitung_x"]) | data["ende_der_bearbeitung_y"].isnull()) end_almost_matches = ((data["ende_der_bearbeitung_y"] <= start_time_next_station) & station_change ) time_matches = (start_matches & end_matches) |\ (start_matches & (~end_matches) & end_almost_matches) rs2_columns = [c for c in rs2.columns if (c not in common_columns) and (c in data.columns)] +\ [c + "_y" for c in rs2.columns if c + "_y" in data.columns] for c in rs2_columns: data.loc[~time_matches, c] = np.nan data.sort_values(by=["radsatznummer", "begin_der_bearbeitung", "ende_der_bearbeitung_y"], inplace=True) # we keep all the rows that were in rs1 even if there are no # corresponding activities from rs2 keep_row = time_matches | (~data["order"].duplicated(keep="first")) data = data.loc[keep_row].copy(deep=True).reset_index(drop=True) data["ende_der_bearbeitung"] = data[["ende_der_bearbeitung_x", "ende_der_bearbeitung_y"]]\ .max(axis=1) data.drop(["ende_der_bearbeitung_x", "ende_der_bearbeitung_y", "order"], axis=1, inplace=True) return data def merge_rs70(self, data: pd.DataFrame, rs70: pd.DataFrame ) -> pd.DataFrame: ''' ''' data["order"] = data.index data = pd.merge(data, rs70, how="left", on="radsatznummer") time_matches = ( (data["eingabe_datum"] >= data["begin_der_bearbeitung"]) & (data["eingabe_datum"] <= data["ende_der_bearbeitung"])) rs70_columns = [c for c in rs70.columns if (c != "radsatznummer") and (c in data.columns)] +\ [c + "_y" for c in rs70.columns if c + "_y" in data.columns] for c in rs70_columns: data.loc[~time_matches, c] = np.nan data.sort_values(by=["radsatznummer", "begin_der_bearbeitung", "eingabe_datum"], inplace=True) keep_row = time_matches | (~data["order"].duplicated(keep="first")) data = data.loc[keep_row]\ .drop("order", axis=1)\ .reset_index(drop=True) return data def merge_rs0(self, data: pd.DataFrame, rs0: pd.DataFrame) -> pd.DataFrame: ''' ''' data = pd.merge(data, rs0, how="left", on="radsatznummer") no_befundung_mask = (data["positionsnummer"] != 110) for column in ["befundung_code_1", "befundung_code_2", "befundung_code_3"]: data.loc[no_befundung_mask, column] = np.nan return data