#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 30 10:16:23 2019

@author: tanya
"""

import pandas as pd
import numpy as np
import os
import sys

sys.path.append(os.getcwd())

from libraries.import_process_instances.CleanRs1 import CleanRs1
class MergeProcessTables:
    '''
    Merges the auxiliary process tables (rs2, rs70, rs0) onto the rs1
    base table.

    All merge methods keep every row of the rs1 base table, attach the
    matching columns where the join conditions hold, and blank the
    attached columns (NaN) where they do not.
    '''

    def merge_rs2(self, data: pd.DataFrame, rs2: pd.DataFrame) -> pd.DataFrame:
        '''
        Merge the rs2 activities onto the rs1 base table.

        Difficulty: rows that correspond to one radsatznummer and one
        station in rs1 and rs2 are in a many-to-many relation and the
        ende_der_bearbeitung for these rows often does not match in the
        two tables.

        Rules:
         A) We check if the ende_der_bearbeitung of an activity from rs2
            is >= begin_der_bearbeitung and <= ende_der_bearbeitung of an
            entry in rs1.
         B) If an activity (row in rs2) has an ende_der_bearbeitung which
            is later than the ende_der_bearbeitung of all the entries in
            rs1, we check if it ended earlier than the
            begin_der_bearbeitung on the next station. If it is so, we
            assign the activity to the latest entry in rs1 for this
            station.

        Same logic applies for merging the table rs70.

        :param data: rs1 base table (not modified)
        :param rs2: activity table to merge in (not modified)
        :return: merged table; ende_der_bearbeitung is the row-wise
            maximum of the rs1 and rs2 end times
        '''
        data = data.copy(deep=True)
        rs2 = rs2.copy(deep=True)

        # True on the last row of each station visit (in the original
        # row order); needed for rule B.
        station_change = (data["positionsnummer"] !=
                          data["positionsnummer"].shift(-1))

        # Remember the original rs1 row so that exactly one copy of each
        # unmatched row survives the many-to-many merge.
        data["order"] = data.index

        common_columns = ["radsatznummer", "positionsnummer",
                          "positionsname"]

        data = pd.merge(data, rs2, how="left", on=common_columns)

        # Start time of the next station; where there is no next station
        # fall back to the rs1 end time.
        start_time_next_station =\
            CleanRs1()._get_next_station_start_time(data)\
            .fillna(data["ende_der_bearbeitung_x"])

        # Rule A: activity ends within the rs1 entry's time window.
        start_matches = (data["ende_der_bearbeitung_y"] >=
                         data["begin_der_bearbeitung"])

        end_matches = ((data["ende_der_bearbeitung_y"] <=
                        data["ende_der_bearbeitung_x"]) |
                       data["ende_der_bearbeitung_y"].isnull())

        # Rule B: activity ends after the rs1 entry, but before work on
        # the next station begins (and this is the station's last entry).
        end_almost_matches = ((data["ende_der_bearbeitung_y"] <=
                               start_time_next_station) &
                              station_change
                              )

        time_matches = (start_matches & end_matches) |\
                       (start_matches & (~end_matches) & end_almost_matches)

        # Columns contributed by rs2: either carried over unsuffixed or
        # renamed with the "_y" merge suffix on a name collision.
        rs2_columns = [c for c in rs2.columns
                       if (c not in common_columns) and (c in data.columns)] +\
                      [c + "_y" for c in rs2.columns
                       if c + "_y" in data.columns]

        # Blank the rs2 columns on rows whose times do not match.
        for c in rs2_columns:
            data.loc[~time_matches, c] = np.nan

        data.sort_values(by=["radsatznummer",
                             "begin_der_bearbeitung",
                             "ende_der_bearbeitung_y"],
                         inplace=True)

        # we keep all the rows that were in rs1 even if there are no
        # corresponding activities from rs2
        keep_row = time_matches | (~data["order"].duplicated(keep="first"))

        data = data.loc[keep_row].copy(deep=True).reset_index(drop=True)

        data["ende_der_bearbeitung"] = data[["ende_der_bearbeitung_x",
                                             "ende_der_bearbeitung_y"]]\
            .max(axis=1)

        data.drop(["ende_der_bearbeitung_x", "ende_der_bearbeitung_y",
                   "order"], axis=1, inplace=True)

        return data

    def merge_rs70(self, data: pd.DataFrame, rs70: pd.DataFrame
                   ) -> pd.DataFrame:
        '''
        Merge the rs70 entries onto the rs1 base table.

        An rs70 entry matches an rs1 row when its eingabe_datum falls
        within [begin_der_bearbeitung, ende_der_bearbeitung] of that row.
        All rs1 rows are kept; rs70 columns are NaN where nothing
        matches.

        :param data: rs1 base table (not modified)
        :param rs70: table to merge in
        :return: merged table
        '''
        # BUGFIX: work on a copy so the caller's frame does not acquire
        # the temporary "order" column (merge_rs2 already copies; the
        # original mutated the input before the rebinding pd.merge).
        data = data.copy(deep=True)

        data["order"] = data.index

        data = pd.merge(data, rs70, how="left", on="radsatznummer")

        time_matches = (
            (data["eingabe_datum"] >= data["begin_der_bearbeitung"]) &
            (data["eingabe_datum"] <= data["ende_der_bearbeitung"]))

        # Columns contributed by rs70 (unsuffixed or "_y"-suffixed).
        rs70_columns = [c for c in rs70.columns
                        if (c != "radsatznummer") and (c in data.columns)] +\
                       [c + "_y" for c in rs70.columns
                        if c + "_y" in data.columns]

        for c in rs70_columns:
            data.loc[~time_matches, c] = np.nan

        data.sort_values(by=["radsatznummer", "begin_der_bearbeitung",
                             "eingabe_datum"], inplace=True)

        # Keep one copy of every original rs1 row even without a match.
        keep_row = time_matches | (~data["order"].duplicated(keep="first"))

        data = data.loc[keep_row]\
                   .drop("order", axis=1)\
                   .reset_index(drop=True)

        return data

    def merge_rs0(self, data: pd.DataFrame, rs0: pd.DataFrame) -> pd.DataFrame:
        '''
        Merge the rs0 columns onto the rs1 base table by radsatznummer.

        The befundung codes are only meaningful on the Befundung station
        (positionsnummer 110); on all other rows they are blanked.

        :param data: rs1 base table (not modified)
        :param rs0: table to merge in
        :return: merged table
        '''
        data = pd.merge(data, rs0, how="left", on="radsatznummer")

        no_befundung_mask = (data["positionsnummer"] != 110)

        for column in ["befundung_code_1",
                       "befundung_code_2",
                       "befundung_code_3"]:
            data.loc[no_befundung_mask, column] = np.nan

        return data
|