
removed oebb libraries

tanja 4 years ago
parent
commit
636e15e163

+ 0 - 178
cdplib/SimplifiedProcessModel.py

@@ -1,178 +0,0 @@
-
-import sys
-import os
-import time
-sys.path.append(os.getcwd())
-from cdplib.log import Log
-log = Log("Simplified Process Model")
-
-
-class SimplifiedProcessModel:
-
-    def __init__(self):
-        self._log = Log("Simplified Process Model")
-
-    @property
-    def process_stages(self):
-        '''
-        For machine learning predictions we divide the process into
-        big stages; stages can be skipped depending on the IHS of the
-        wheelset. We use all information gathered during the previous
-        process stages to make predictions for the next stage.
-        '''
-        import networkx as nx
-        '''
-        critical_stations = {"A": [421, 110]}
-
-        critical_stations["B"] = [130]
-
-        critical_stations["C"] = [140, 150]
-
-        critical_stations["D"] = [410]
-
-        critical_stations["E"] = [510, 520, 535,
-                                530, 550]
-
-        critical_stations["F"] = [490, 480, 430, 170]
-
-        critical_stations["G"] = [595, 190, 630]
-
-        critical_stations["H"] = [640]
-
-        critical_stations["I"] = [650, 560]
-
-        critical_stations["J"] = [675]
-
-        critical_stations["K"] = [690]
-
-        critical_stations["L"] = [710, 670]
-
-        stages_graph = nx.DiGraph()
-
-        for stage in critical_stations:
-            stages_graph.add_node(stage, stations=critical_stations[stage])
-
-        stages_graph.add_edge("A", "B")
-        stages_graph.add_edge("B", "C")
-        stages_graph.add_edge("C", "D")
-        stages_graph.add_edge("D", "E")
-        stages_graph.add_edge("D", "F")
-        stages_graph.add_edge("E", "G")
-        stages_graph.add_edge("F", "G")
-        stages_graph.add_edge("G", "H")
-        stages_graph.add_edge("H", "I")
-        stages_graph.add_edge("I", "J")
-        stages_graph.add_edge("J", "K")
-        stages_graph.add_edge("K", "L")
-        '''
-
-        critical_stations = {"A": [421, 110]}
-
-        critical_stations["B"] = [130]
-
-        critical_stations["C"] = [140, 150]
-
-        critical_stations["D"] = [410]
-
-        critical_stations["E"] = [510, 520, 535,
-                                530, 320, 550]
-
-        #critical_stations["F"] = [490, 480, 430, 595]
-
-        #critical_stations["G"] = [170, 506, 430]
-
-        # Merge the two above since they both contain station 430
-        critical_stations["F"] = [490, 480, 430, 595, 170, 506]
-
-        critical_stations["H"] = [190]
-
-        critical_stations["I"] = [320, 340, 630]
-
-        # This path runs in parallel with the rest of the paths; handle it as an exception
-        #critical_stations["J"] = [680]
-
-        critical_stations["K"] = [640]
-
-        stages_graph = nx.DiGraph()
-        for stage in critical_stations:
-            stages_graph.add_node(node_for_adding=stage, stations=critical_stations[stage])
-
-        stages_graph.add_edge("A", "B")
-
-        stages_graph.add_edge("B", "C")
-        stages_graph.add_edge("B", "K")
-        #stages_graph.add_edge("B", "J")
-
-        stages_graph.add_edge("C", "D")
-
-        stages_graph.add_edge("D", "E")
-        stages_graph.add_edge("D", "F")
-        #stages_graph.add_edge("D", "G")
-        stages_graph.add_edge("D", "H")
-
-        stages_graph.add_edge("E", "H")
-
-        #stages_graph.add_edge("F", "G")
-
-        #stages_graph.add_edge("G", "H")
-
-        stages_graph.add_edge("E", "I")
-        stages_graph.add_edge("H", "I")
-
-        #stages_graph.add_edge("J", "K")
-
-        stages_graph.add_edge("I", "K")
-
-        return stages_graph
-
-    def process_stages_paths(self):
-        '''
-        Map each simple path from stage "A" to every reachable stage onto
-        the list of stations collected along that path.
-        '''
-        import networkx as nx
-        result_graph = self.process_stages
-
-        result_dict = {}
-        for target_stage in result_graph.nodes():
-            for path in nx.all_simple_paths(result_graph, source="A", target=target_stage):
-                station_list = []
-                for stage in path:
-                    for station in result_graph.nodes()[stage]['stations']:
-                        station_list.append(station)
-                result_dict[str(path)] = station_list
-        return result_dict
-
-    def get_stage(self, station_nummer: int):
-        '''
-        Return the stage a station belongs to, or None if the station is
-        not assigned to any stage.
-        '''
-        result_graph = self.process_stages
-        # Look the station up on the stage nodes directly: iterating only
-        # over simple paths from "A" to "K" misses stages that lie on no
-        # such path, e.g. the dead-end stage "F".
-        for stage, data in result_graph.nodes(data=True):
-            if station_nummer in data['stations']:
-                return stage
-        return None
-
-    def calculate_stage_path(self, stations_list: list):
-        position_list = list(self.process_stages.nodes())
-        current_index = 0
-        stage_path = []
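-        # Stages must appear in the graph's node-insertion order; a station
-        # mapping to an earlier stage means the wheelset left the modelled path.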
-        for station in stations_list:
-            stage = self.get_stage(station)
-            if stage is not None:
-                new_index = position_list.index(stage)
-                if current_index <= new_index:
-                    if stage not in stage_path:
-                        stage_path.append(stage)
-                        current_index = new_index
-                else:
-                    self._log.warning('Path does not conform to the model, no prediction can be made')
-                    return
-        return str(stage_path)
-
-    def get_stations_for_prediction_model(self, stations_list):
-        stages_paths = self.process_stages_paths()
-        stage_path = self.calculate_stage_path(stations_list)
-        if stage_path is None:
-            # Path does not conform to the model, see calculate_stage_path
-            return None
-        prediction_stations = stages_paths[stage_path]
-
-        # Hack solution for handling that station 680 is running in parallel with most other stations
-        if len(prediction_stations) > 3:
-            prediction_stations.append(680)
-        return prediction_stations
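
For context, a minimal sketch of how the removed model was driven. The station numbers below are the real stage A-C stations from the graph above; the call sequence itself is illustrative, not taken from actual calling code:

    from cdplib.SimplifiedProcessModel import SimplifiedProcessModel

    model = SimplifiedProcessModel()

    # Stations a wheelset has visited so far, in processing order
    visited_stations = [421, 110, 130, 140]

    # Stations whose measurements can feed the prediction for the next
    # stage; None if the visited path does not conform to the stage graph
    print(model.get_stations_for_prediction_model(visited_stations))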

+ 0 - 63
cdplib/data_cleaning/DataCleaningUtils.py

@@ -1,63 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Sep 27 16:20:03 2019
-
-@author: tanya
-"""
-
-import pandas as pd
-import numpy as np
-
-
-class CleaningUtils:
-    '''
-    Static helpers for cleaning raw data.
-    '''
-    @staticmethod
-    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
-        '''
-        Parse a series of date strings, trying each format in turn and
-        keeping the first successful conversion per entry.
-        '''
-        # list("%d%m%Y") would split a single format string into characters
-        if isinstance(formats, str):
-            formats = [formats]
-
-        converted = pd.Series([pd.to_datetime(np.nan)]*len(series),
-                              index=series.index)
-
-        for formt in formats:
-            if formt == "%d%m%Y":
-                missing_leading_zero = (series.astype(str).str.len() == 7)
-
-                series = series.astype(str)
-
-                series.loc[missing_leading_zero] = "0" +\
-                    series.loc[missing_leading_zero]
-
-            converted_this_format = pd.to_datetime(series,
-                                                   format=formt,
-                                                   errors="coerce")
-
-            converted.fillna(converted_this_format, inplace=True)
-
-        return converted
-
-    def standarize_writing(self, s: str, to_lowercase: bool = True):
-        '''
-        Transliterate German special characters and normalize the string
-        to an alphanumeric, '_'-separated form.
-        '''
-        import re
-
-        german_character_mapping = {"ß": "ss",
-                                    "ü": "ue",
-                                    "Ü": "Ue",
-                                    "ä": "ae",
-                                    "Ä": "Ae",
-                                    "ö": "oe",
-                                    "Ö": "Oe"}
-
-        s = s.encode('raw_unicode_escape').decode('raw_unicode_escape')
-        for char, correct_char in german_character_mapping.items():
-            s = s.replace(char, correct_char)
-
-        if to_lowercase:
-            s = s.lower()
-
-        s = re.sub('[^0-9a-zA-Z]+', '_', s).lstrip("_").rstrip("_")
-
-        return s
-
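
A short sketch of how these helpers would be called; the sample inputs are invented for illustration:

    import pandas as pd
    from cdplib.data_cleaning import CleaningUtils

    # "1012020" is missing its leading zero and is fixed up internally
    dates = pd.Series(["31122019", "1012020"])
    print(CleaningUtils.convert_dates(dates, "%d%m%Y"))

    utils = CleaningUtils()
    print(utils.standarize_writing("Größenprüfung Station 130"))
    # -> groessenpruefung_station_130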

+ 0 - 1
cdplib/data_cleaning/__init__.py

@@ -1 +0,0 @@
-from .DataCleaningUtils import *

+ 0 - 151
cdplib/db_handlers/OebbMongodbHandler.py

@@ -1,151 +0,0 @@
-"""
-Created on Mon Sep 16 13:27:44 2019
-
-@author: oskar
-@description: Oebb specific Mongodb Handler which inherits from the 'general' Mongodb Handler
-"""
-
-
-import json
-import simplejson
-import sys
-import os
-import jsonref
-
-from copy import deepcopy
-from pymongo import MongoClient
-import pandas as pd
-import numpy as np
-
-sys.path.append(os.getcwd())
-from libraries.log import Log
-from libraries.configuration import default as cfg
-from libraries.db_handlers.MongodbHandler import MongodbHandler
-from libraries.Singleton_Threadsafe import SingletonThreadsafe
-
-# 'log' is referenced in OebbMongodbHandlerPool.aquire below
-log = Log("Oebb Mongodb Handler")
-
-
-class OebbMongodbHandlerPool(metaclass=SingletonThreadsafe):
-    '''
-    Thread-safe singleton pool of reusable OebbMongodbHandler instances.
-    '''
-
-    def __init__(self, size: int = 10):
-        self._size = size
-        self._mongodb_handlers = [OebbMongodbHandler() for _ in range(size)]
-
-    def aquire(self):
-        while not self._mongodb_handlers:
-            self._mongodb_handlers = [OebbMongodbHandler() for _ in range(self._size)]
-            log.warning(("Ran out of Mongodb handlers, {} more have been "
-                         "added. Are you sure you've returned yours?").format(self._size))
-        return self._mongodb_handlers.pop()
-        
-    def release(self, mongodb_handler):
-        if len(self._mongodb_handlers) < self._size:
-            self._mongodb_handlers.append(mongodb_handler)
-
-
-class OebbMongodbHandler(MongodbHandler):
-
-    def __init__(self, database_url: str = None,
-                 database_name: str = None):
-        '''
-        :param str database_url:
-        :param str database_name:
-        '''
-        super().__init__(
-                database_url=database_url,
-                database_name=database_name)
-
-        self._log = Log("Oebb Mongodb Handler")
-
-
-    def query_process_instances_with_stage_filtering(self, stage_path: list):
-        '''
-        :param list stage_path: stage path for which the data will be queried.
-
-        Returns all wheelsets whose final stage path contains stage_path as a
-        subset, restricted to the process steps belonging to the stages in
-        stage_path.
-        '''
-        assert(isinstance(stage_path, list)),\
-            "Parameter 'stage_path' must be a list type"
-
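-        # Unwind the process steps so they can be sorted by step_number,
-        # regroup them per wheelset, then keep only the steps whose stage
-        # lies on the requested stage_path.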
-        aggregation_pipeline = [
-                                {"$match": {"$expr": {"$setIsSubset": [stage_path, '$final_state.stage_path']}}},
-                                {"$unwind": "$process"},
-                                {"$sort": {"process.step_number": 1}},
-                                {"$group": {
-                                    "_id": {
-                                            "_id": "$_id", 
-                                            "radsatznummer": "$radsatznummer",
-                                            "final_state": "$final_state"
-                                            }, 
-                                        "process": {"$push": "$process"}
-                                    }
-                                },
-                                {"$project": {
-                                    "_id": "$_id._id",
-                                    "radsatznummer": "$_id.radsatznummer",
-                                    "final_state": "$_id.final_state",
-                                    "process":{
-                                        "$filter":{
-                                            "input": "$process",
-                                            "as": "process",
-                                            "cond":{
-                                                "$setIsSubset": [["$$process.stage"], stage_path]
-                                            },      
-                                        }
-                                    }
-                                }
-                                }
-                            ]
-
-        return self.aggregate_data_and_generate_dataframe('process_instances', aggregation_pipeline)
-
-    def query_process_instances_with_guaranteed_station_sorting(self, attribute: str = None,
-                                                attribute_value: str = None, comparison_operator: str = '$eq', ascending: bool = True):
-        '''
-        :param str attribute: field to filter the process instances on
-        :param str attribute_value: value the field is compared against
-        :param str comparison_operator: mongodb comparison operator, e.g. '$eq'
-        :param bool ascending: sort order of the process steps by step_number
-
-        Returns the matching wheelsets with their process steps guaranteed to
-        be sorted by step_number.
-        '''
-        if attribute is not None:
-            assert(isinstance(attribute, str)),\
-                "Parameter 'attribute' must be a string type"
-
-        if attribute_value is not None:
-            assert(isinstance(attribute_value, (str, int, bool))),\
-                "Parameter 'attribute_value' must be a string, integer or boolean type"
-    
-        assert(isinstance(comparison_operator, str)),\
-            "Parameter 'comparison_operator' must be a string type"
-
-        assert(isinstance(ascending, bool)),\
-            "Parameter 'ascending' must be a boolean type"
-
-        
-        order = 1 if ascending else -1
-
-        aggregation_pipeline = [
-                                {"$match":{ attribute:{comparison_operator: attribute_value}}},
-                                {"$unwind": "$process"},
-                                {"$sort": {"process.step_number": order}},
-                                {"$group": {
-                                    "_id": {
-                                            "_id": "$_id", 
-                                            "radsatznummer": "$radsatznummer",
-                                            "final_state": "$final_state"
-                                            }, 
-                                        "process": {"$push": "$process"}
-                                    }
-                                },
-                                {"$project": {
-                                    "_id": "$_id._id",
-                                    "radsatznummer": "$_id.radsatznummer",
-                                    "final_state": "$_id.final_state",
-                                    "process": "$process"}
-                                }
-                            ]
-                    
-        return self.aggregate_data_and_generate_dataframe('process_instances', aggregation_pipeline)
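
For context, a sketch of the acquire/release pattern the removed pool supported. Note the module imports from libraries/ although it lives under cdplib/, so the import path below assumes the libraries layout; the stage names and pool size are illustrative, and 'aquire' is spelled as in the source:

    from libraries.db_handlers.OebbMongodbHandler import OebbMongodbHandlerPool

    pool = OebbMongodbHandlerPool(size=10)
    handler = pool.aquire()
    try:
        # wheelsets whose final stage path contains stages "A" and "B"
        df = handler.query_process_instances_with_stage_filtering(["A", "B"])
    finally:
        pool.release(handler)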

BIN
cdplib/db_handlers/__pycache__/MongodbHandler.cpython-37.pyc


BIN
cdplib/db_handlers/__pycache__/SQLHandler.cpython-37.pyc


BIN
cdplib/db_handlers/__pycache__/SQLOperations.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/DataFrameToCollection.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/MigrationCleaning.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/ParseDbSchema.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/ParseJsonSchema.cpython-37.pyc


BIN
cdplib/db_migration/__pycache__/ParseMapping.cpython-37.pyc


BIN
cdplib/utils/__pycache__/ClassLogging.cpython-37.pyc


BIN
cdplib/utils/__pycache__/CleaningUtils.cpython-37.pyc


BIN
cdplib/utils/__pycache__/ExceptionsHandler.cpython-37.pyc