MergeProcessTables.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Mon Sep 30 10:16:23 2019
  5. @author: tanya
  6. """
  7. import pandas as pd
  8. import numpy as np
  9. import os
  10. import sys
  11. sys.path.append(os.getcwd())
  12. from libraries.import_process_instances.CleanRs1 import CleanRs1
  13. class MergeProcessTables:
  14. '''
  15. '''
  16. def merge_rs2(self, data: pd.DataFrame, rs2: pd.DataFrame) -> pd.DataFrame:
  17. '''
  18. Difficulty: rows that correspond to one radsatznummer and one station
  19. in rs1 and rs2 are in many-to-many relation and
  20. the ende_der_bearbeitung
  21. for these rows often does not match in the two tables.
  22. Rules:
  23. A) We check if the end_der_bearbeitung of
  24. an activity from rs2 is >=
  25. begin_der_bearbeitung and <= of
  26. ende_der_bearbeitung of an entry in rs1
  27. B) If an activity (row in rs2) has ende_der_bearbeitung
  28. which is later
  29. than ende_der_bearbeitung of all the entries in rs1, we check if it
  30. ended earlier than the begin_der_bearbeitung on the next station.
  31. If it
  32. is so, we assign the activity to the latest entry in rs1 for
  33. this station.
  34. Same logic applies for merging the table rs70.
  35. '''
  36. data = data.copy(deep=True)
  37. rs2 = rs2.copy(deep=True)
  38. station_change = (data["positionsnummer"] !=
  39. data["positionsnummer"].shift(-1))
  40. data["order"] = data.index
  41. common_columns = ["radsatznummer", "positionsnummer",
  42. "positionsname"]
  43. data = pd.merge(data, rs2, how="left", on=common_columns)
  44. start_time_next_station =\
  45. CleanRs1()._get_next_station_start_time(data)\
  46. .fillna(data["ende_der_bearbeitung_x"])
  47. start_matches = (data["ende_der_bearbeitung_y"] >=
  48. data["begin_der_bearbeitung"])
  49. end_matches = ((data["ende_der_bearbeitung_y"] <=
  50. data["ende_der_bearbeitung_x"]) |
  51. data["ende_der_bearbeitung_y"].isnull())
  52. end_almost_matches = ((data["ende_der_bearbeitung_y"] <=
  53. start_time_next_station) &
  54. station_change
  55. )
  56. time_matches = (start_matches & end_matches) |\
  57. (start_matches & (~end_matches) & end_almost_matches)
  58. rs2_columns = [c for c in rs2.columns
  59. if (c not in common_columns) and (c in data.columns)] +\
  60. [c + "_y" for c in rs2.columns
  61. if c + "_y" in data.columns]
  62. for c in rs2_columns:
  63. data.loc[~time_matches, c] = np.nan
  64. data.sort_values(by=["radsatznummer",
  65. "begin_der_bearbeitung",
  66. "ende_der_bearbeitung_y"],
  67. inplace=True)
  68. # we keep all the rows that were in rs1 even if there are no
  69. # corresponding activities from rs2
  70. keep_row = time_matches | (~data["order"].duplicated(keep="first"))
  71. data = data.loc[keep_row].copy(deep=True).reset_index(drop=True)
  72. data["ende_der_bearbeitung"] = data[["ende_der_bearbeitung_x",
  73. "ende_der_bearbeitung_y"]]\
  74. .max(axis=1)
  75. data.drop(["ende_der_bearbeitung_x", "ende_der_bearbeitung_y",
  76. "order"], axis=1, inplace=True)
  77. return data
  78. def merge_rs70(self, data: pd.DataFrame, rs70: pd.DataFrame
  79. ) -> pd.DataFrame:
  80. '''
  81. '''
  82. data["order"] = data.index
  83. data = pd.merge(data, rs70, how="left", on="radsatznummer")
  84. time_matches = (
  85. (data["eingabe_datum"] >= data["begin_der_bearbeitung"]) &
  86. (data["eingabe_datum"] <= data["ende_der_bearbeitung"]))
  87. rs70_columns = [c for c in rs70.columns
  88. if (c != "radsatznummer") and (c in data.columns)] +\
  89. [c + "_y" for c in rs70.columns
  90. if c + "_y" in data.columns]
  91. for c in rs70_columns:
  92. data.loc[~time_matches, c] = np.nan
  93. data.sort_values(by=["radsatznummer", "begin_der_bearbeitung",
  94. "eingabe_datum"], inplace=True)
  95. keep_row = time_matches | (~data["order"].duplicated(keep="first"))
  96. data = data.loc[keep_row]\
  97. .drop("order", axis=1)\
  98. .reset_index(drop=True)
  99. return data
  100. def merge_rs0(self, data: pd.DataFrame, rs0: pd.DataFrame) -> pd.DataFrame:
  101. '''
  102. '''
  103. data = pd.merge(data, rs0, how="left", on="radsatznummer")
  104. no_befundung_mask = (data["positionsnummer"] != 110)
  105. for column in ["befundung_code_1",
  106. "befundung_code_2",
  107. "befundung_code_3"]:
  108. data.loc[no_befundung_mask, column] = np.nan
  109. return data