
Moved DataExplorer from Oebb to CDPLib

ogert 4 years ago
parent
commit 7ca977d0cf
1 file changed with 602 additions and 0 deletions

+ 602 - 0
cdplib/DataExplorer/DataExplorer.py

@@ -0,0 +1,602 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+import time
+from pprint import pprint
+import pandas as pd
+import numpy as np
+import datetime
+sys.path.append(os.getcwd())
+from copy import deepcopy
+from cdplib.log import Log
+from cdplib.FlattenData import FlattenData
+from libraries.SimplifiedProcessModel import SimplifiedProcessModel
+from libraries.base_path_prediction.Base_Path_Predictor import Base_Path_Predictor
+from libraries.configuration import default as cfg
+from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
+
+class DataExplorer:
+
+    def __init__(self):
+        self._log = Log("Data Explorer")
+
+    def calculate_correlation(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False):
+
+        no_nan_data = data.fillna(0)
+        correlations = []
+        
+        pearson_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict,  method='pearson')
+        self._log.info('Calculated Pearson correlation')
+
+        if verbose:
+            kendall_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict,  method='kendall')
+            correlations.append(kendall_correlation)
+            self._log.info('Calculated Kendall correlation')
+
+            spearman_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict,  method='spearman')
+            correlations.append(spearman_correlation)
+            self._log.info('Calculated Spearman correlation')
+
+            cosine_similarity = self.calculate_cosine_similarity_for_dataframe_columns(no_nan_data, column_name_to_predict)
+            correlations.append(cosine_similarity)
+            self._log.info('Calculated cosine similarity')
+
+            chi2_independence = self.calculate_chi2_independence_for_dataframe_columns(data, column_name_to_predict, verbose)
+            correlations.append(chi2_independence)
+            self._log.info('Calculated chi2 independence')
+
+            fisher_exact_test = self.calculate_fisher_exact_for_dataframe_column(data, column_name_to_predict)
+            correlations.append(fisher_exact_test)
+            self._log.info('Calculated Fisher Exact Test')
+
+        gert_correlation = self.calculate_feature_and_schrott_occurunces(data, column_name_to_predict, prediction_name, verbose)
+        correlations.append(gert_correlation)
+        self._log.info('Calculated Gert correlation')
+        
+        merged_data = pearson_correlation
+        for correlation in correlations:
+            if len(correlation.index) > 0:
+                merged_data = merged_data.join(correlation, how='left')
+        
+        return merged_data
+     
+
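+    # A minimal usage sketch (hypothetical names, illustrative only): given a
+    # one-hot encoded feature DataFrame `features` with a binary target column
+    # 'ist_schrott', the combined correlation overview could be built with:
+    #
+    #     explorer = DataExplorer()
+    #     overview = explorer.calculate_correlation(features, 'ist_schrott', 'Schrott')
+    #     print(overview.sort_values('pearson correlation', ascending=False).head(20))
+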
+    def calculate_big_matrix_correlation(self, data: pd.DataFrame, column_name_to_predict: str, method: str = 'pearson') -> pd.DataFrame:
+
+        num_columns = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
+        result_data = {}
+        for column in data.columns:
+            if data[column].dtype in num_columns: 
+                result_data[column] = data[column_name_to_predict].corr(data[column], method=method)
+        
+        result_df = pd.DataFrame.from_dict(result_data, orient='index')
+        label_string = method + ' correlation'
+        result_df.columns = [label_string]
+        return result_df
+        
+    def calculate_feature_and_schrott_occurunces(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False) -> pd.DataFrame:
+
+        result_dict = {}
+        len_data = len(data.index)
+        total_schrott = len(data[data[column_name_to_predict] == 1].index)
+        counter = 0
+        for column in data.columns:
+            occurunces = 0
+            schrott_occurunces = 0
+            temp_data = data[data[column] == 1]
+            #print('Temp Data:', temp_data)
+            if len(temp_data.index) > 0:
+                occurunces = len(temp_data.index)
+                schrott_occurunces = len(temp_data[temp_data[column_name_to_predict] == 1].index)
+                schrott_wheelsets = list(temp_data[temp_data[column_name_to_predict] == 1].index)
+                non_schrott = occurunces - schrott_occurunces
+                non_occurunces = len_data - occurunces
+                percentage_of_occurances_schrott = round((schrott_occurunces/occurunces)*100, 2)
+                percentage_of_occurances_not_schrott = round((non_schrott/occurunces)*100, 2)
+                
+                if verbose:
+                    result_dict[column] = {
+                        'Occurs':occurunces,
+                        '%_Occurs': occurunces/len_data,
+
+                        prediction_name + '_Occurs':schrott_occurunces,
+                        '%_' + prediction_name + '_Occurs': percentage_of_occurances_schrott,
+                        '%_Total_' + prediction_name: round((schrott_occurunces/total_schrott)*100, 2),
+
+                        '!Occurs': non_occurunces,
+                        '%_!Occurs': round((non_occurunces/len_data)*100, 2),
+
+                        'Occurs_!' + prediction_name: non_schrott,
+                        '%_Occurs_!' + prediction_name: percentage_of_occurances_not_schrott,
+                        '%_Total_!' + prediction_name: round((non_schrott/len_data)*100, 2),
+
+                        'Wheelsets_!' + prediction_name: [wheelset for wheelset in list(temp_data.index) if wheelset not in schrott_wheelsets],
+                        'Wheelsets_' + prediction_name: schrott_wheelsets
+                    }
+                else:
+                    result_dict[column] = {
+
+                        'Occurs':occurunces,
+                        '%_Occurs': occurunces/len_data,
+
+                        'Schrott_Occurs':schrott_occurunces,
+                        '%_Schrott_Occurs': percentage_of_occurances_schrott,
+                        '%_Total_Schrott': schrott_occurunces/total_schrott,
+
+                    }
+
+                counter+=1
+                if counter % 100 == 0:
+                    print(('Calculated {} / {}').format(counter, len(data.columns)))
+            
+        return pd.DataFrame.from_dict(result_dict, orient='index')
+
+    def calculate_fisher_exact_for_dataframe_column(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame:
+      
+        from scipy.stats import fisher_exact
+        result_dict = {}
+        predict_count = data[column_name_to_predict]
+
+        for column in data.columns:
+            
+            fisher_data = pd.crosstab(predict_count, data[column])
+            if fisher_data.shape == (2, 2):
+                oddsratio, pvalue = fisher_exact(fisher_data)
+                result_dict[column] = {
+                                        'Fisher_Exact_pvalue': pvalue,
+                                        'Fisher_Exact_oddsratio': oddsratio
+                                        }
+            else:
+                result_dict[column] = {
+                                        'Fisher_Exact_pvalue': None,
+                                        'Fisher_Exact_oddsratio': None
+                                        }
+        return pd.DataFrame.from_dict(result_dict, orient='index')
+        
+
+    def calculate_cosine_similarity_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame:
+        from sklearn.metrics.pairwise import cosine_similarity
+        cosine_corr = cosine_similarity(data.transpose())
+        cosine_df = pd.DataFrame(cosine_corr, columns=data.columns, index=data.columns)
+        cosine_df = pd.DataFrame(cosine_df[column_name_to_predict])
+        cosine_df.columns = ['Cosine Similarity']
+
+        return cosine_df
+
+    def calculate_chi2_independence_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str, verbose: bool = False) -> pd.DataFrame:
+        from scipy.stats import chi2_contingency
+
+        result_dict = {}
+        Y = data[column_name_to_predict]
+        for column in data.columns:
+            X = data[column]
+
+            observed = pd.crosstab(Y,X)
+            chi2=None
+            p=None
+            dof=None
+            expected=None
+            if observed.size == 4:
+                chi2, p, dof, expected = chi2_contingency(observed.values)
+
+                if dof > 1:
+                    self._log.warning(('Calculation contained DOF > 1, this is the data:\n {}').format(X))
+
+                if verbose:
+                    result_dict[column] = {
+                                            'Chi2': chi2,
+                                            'Chi2, p':p,
+                                            'Chi2, dof':dof,
+                                            'Chi2, expected':expected
+                                        }
+                else:
+                    result_dict[column] = {
+                                            'Chi2, p':p,
+                                        }
+            else:
+                if verbose:
+                    result_dict[column] = {
+                                            'Chi2': None,
+                                            'Chi2, p':None,
+                                            'Chi2, dof':None,
+                                            'Chi2, expected':None
+                                        }
+                else:
+                    result_dict[column] = {
+                                            'Chi2, p':None,
+                                        }
+            
+        return pd.DataFrame.from_dict(result_dict, orient='index')
+
+    def calculate_machine_learning_scores(self, predictions, true_values, y_train, to_date:str = None):
+        '''
+        Calculates several different metrics for the predictions compared to the true values and returns two dicts: one to be stored in MongoDB and one to be saved to Excel.
+
+        :param predictions: The machine learning predictions
+        :param true_values: The correct labels for the dataset
+        :param y_train: The correct labels for the training data
+        :param to_date: Timestamp for the report; defaults to the current UTC time
+        '''
+        from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, balanced_accuracy_score  
+
+        if to_date is None:
+            to_date = str(datetime.datetime.utcnow())
+
+        confusion_matrix = confusion_matrix(true_values, predictions)
+        #normalized_confusion_matrix = confusion_matrix(true_values, predictions, normalize=True)
+        normalized_confusion_matrix = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]
+        classification_report = classification_report(true_values, predictions)
+        accuracy_score = round(accuracy_score(true_values, predictions)*100, 2)
+        roc_auc_score = round(roc_auc_score(true_values, predictions)*100,2)
+        balanced_custom_cost_score = self.balanced_custom_cost_function(true_values, predictions)
+        custom_cost_score = self.custom_cost_function(true_values, predictions)
+        #roc_auc_score = 'Unknown'
+        balanced_accuracy = round(balanced_accuracy_score(true_values, predictions)*100, 2)
+        
+        
+        print("Timestamp\n", to_date)
+        print('\n')
+        print(('% True Positives = {}').format(confusion_matrix[1][1]/(confusion_matrix[1][1]+confusion_matrix[1][0])))
+        print(('% False Positives = {}').format(confusion_matrix[0][1]/(confusion_matrix[0][1]+confusion_matrix[0][0])))
+        print('\n')
+        print('Accuracy:\n', accuracy_score)
+        print('\n')
+        print('Balanced Accuracy:\n', balanced_accuracy)
+        print('\n')
+        print('ROC AUC Score:\n', roc_auc_score)
+        print('\n')
+        print('Balanced Custom Cost Function:\n', balanced_custom_cost_score)
+        print('\n')
+        print('Custom Cost Function:\n', custom_cost_score)
+        print('\n')
+        print('Confusion Matrix:\n', confusion_matrix)
+        print('\n')
+        print('Normalized Confusion Matrix:\n', normalized_confusion_matrix)
+        print('\n')
+        print('Classification Report:\n', classification_report)
+        
+        result_dict = {
+            'Size of training set:': len(y_train),
+            'Number of Schrott in training set': y_train.value_counts().loc[True],
+            'Size of prediction set:': len(true_values),
+            'Number of Schrott in prediction set': true_values.value_counts().loc[True],
+            'Confusion Matrix': confusion_matrix,
+            'Normalized Confusion Matrix': normalized_confusion_matrix,
+            'Accuracy': accuracy_score,
+            'Balanced Accuracy': balanced_accuracy,
+            'ROC AUC Score:': roc_auc_score,
+            'Balanced Custom Cost Score:': balanced_custom_cost_score,
+            'Custom Cost Score:': custom_cost_score
+        }
+
+        confusion_matrix = confusion_matrix.ravel()
+
+        database_dict = {
+                "timestamp": to_date,
+                "result": {
+                    "training_set":{
+                        "amount": int(len(y_train)),
+                        "schrott": int(y_train.value_counts().loc[True])
+                    },
+                    "test_set":{
+                        "amount": int(len(true_values)),
+                        "schrott": int(true_values.value_counts().loc[True])
+                    },
+                    "confusion_matrix":[
+                        ("True Negatives", int(confusion_matrix[0])),
+                        ("False Positives", int(confusion_matrix[1])),
+                        ("False Negatives", int(confusion_matrix[2])),
+                        ("True Positives", int(confusion_matrix[3]))
+                    ],
+                    "normalized_confusion_matrix":[
+                        ("True Negatives", round(normalized_confusion_matrix[0][0]*100, 2)),
+                        ("False Positives", round(normalized_confusion_matrix[0][1]*100, 2)),
+                        ("False Negatives", round(normalized_confusion_matrix[1][0]*100, 2)),
+                        ("True Positives", round(normalized_confusion_matrix[1][1]*100, 2))
+                    ],
+                    "accuracy": accuracy_score,
+                    "balanced_accuracy": balanced_accuracy,
+                    "ROC_AUC_score": roc_auc_score,
+                    'balanced_custom_cost_score': int(balanced_custom_cost_score),
+                    'custom_cost_score': int(custom_cost_score)
+                }   
+        }
+
+        pprint(database_dict)
+        
+        #result_dict={}
+        #database_dict={}
+        return result_dict, database_dict
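+
+    # A usage sketch (hypothetical names, illustrative only): after fitting some
+    # classifier `clf` on a train/test split, the two report dicts could be
+    # produced and stored with:
+    #
+    #     report, db_doc = explorer.calculate_machine_learning_scores(
+    #         predictions=clf.predict(X_test), true_values=y_test, y_train=y_train)
+    #     mongodb_handler.insert_data_into_collection(db_doc, 'prediction_scores')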
+
+    def get_total_prediction_results(self, in_data, limit: float = 0.5, station = None):
+
+        data = deepcopy(in_data)
+
+        data_flattener = FlattenData()
+        data['ist_schrott'] = data_flattener.flatten(data['final_state'])
+        predictions_lists = []  
+        stations_lists = []
+        num_correct_predictions = []
+        num_wrong_predictions = []
+        
+        for true_value, predictions in zip(data['ist_schrott'], data['process']):
+
+            predictions_list, stations_list= self.filter_by_threshold_and_station(predictions, limit, station)
+            predictions_lists.append(predictions_list)
+            stations_lists.append(stations_list)
+
+            num_true = sum(predictions_list)
+            num_false = len(predictions_list) - sum(predictions_list)
+            if true_value:
+                
+                num_correct_predictions.append(num_true)
+                num_wrong_predictions.append(num_false)
+            else:
+                num_correct_predictions.append(num_false)
+                num_wrong_predictions.append(num_true)
+            
+        # Fill DataFrame
+        data['stations'] = stations_lists
+        data['predictions'] = predictions_lists
+        data['num_predictions'] = data['predictions'].str.len()
+        data['num_correct_predictions'] = num_correct_predictions
+        data['%_correct_predictions'] = round(data['num_correct_predictions'] / data['num_predictions'], 3)*100
+        data['num_wrong_predictions'] = num_wrong_predictions
+        data['%_wrong_predictions'] = round(data['num_wrong_predictions'] / data['num_predictions'], 3)*100
+        '''
+        print('LIMIT:', limit)
+        num_wheelsets = len(data.index) 
+
+        # Print results
+        print('Predictions for station:', station if not None else 'All stations')
+        print('Wheelsets predicted:', num_wheelsets)
+        print('Predictions made:', data['num_predictions'].sum())
+        '''
+        return_dict = {}
+
+        if station is None:
+            station = 'All Stations'
+            return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data) }
+            return_dict[station+' per Wheelset'] = {'result': self.calculate_statistics_per_wheelset(data)}
+        else:
+            return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data) }
+
+        data.drop(['final_state', 'process'], axis=1, inplace=True)
+
+        return return_dict
+
+    def filter_by_threshold_and_station(self, schrott_list: list, limit: float = 0.5, station=None):
+    
+        
+        prediction_list = []
+        station_list = []
+        for value in schrott_list:
+            if value and 'schrott' in value.keys():
+                if station is None:
+                    station_list.append(value['stationsnummer'])
+                    prediction_list.append(value['schrott'] > limit)
+                elif value['stationsnummer'] == station:
+                    station_list.append(value['stationsnummer'])
+                    prediction_list.append(value['schrott'] > limit)
+            
+
+        return (prediction_list, station_list)
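+
+    # Expected input shape (hypothetical values, illustrative only): each entry of
+    # `schrott_list` is assumed to be a dict such as
+    #     {'stationsnummer': 421, 'schrott': 0.73}
+    # With limit=0.5 and station=None this entry would yield
+    #     prediction_list -> [True], station_list -> [421]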
+
+    def calculate_statistics_for_all_predictions(self, data: pd.DataFrame):
+
+        correct_predictions = data['num_correct_predictions'].sum()
+        wrong_predictions = data['num_wrong_predictions'].sum()
+
+        true_positives =data[data['ist_schrott']==True]['num_correct_predictions'].sum()
+        false_negatives = data[data['ist_schrott']==True]['num_wrong_predictions'].sum()
+
+        true_negatives = data[data['ist_schrott']==False]['num_correct_predictions'].sum()
+        false_positives = data[data['ist_schrott']==False]['num_wrong_predictions'].sum()
+
+        confusion_matrix = [true_negatives, false_positives, false_negatives, true_positives]
+        if (true_negatives + false_positives) != 0 and (false_negatives + true_positives) != 0:
+            normalized_confusion_matrix = [ round(true_negatives/(true_negatives + false_positives),3),\
+                                            round(false_positives/(true_negatives + false_positives),3),\
+                                            round(false_negatives/(false_negatives + true_positives),3),\
+                                            round(true_positives/(false_negatives + true_positives),3)]
+            
+        else:
+            # Use an all-zero matrix so the indexing below does not fail when a class is missing
+            normalized_confusion_matrix = [0, 0, 0, 0]
+        accuracy = round(data['num_correct_predictions'].sum()/(data['num_predictions'].sum()), 3) *100
+        balanced_accuracy = round( (normalized_confusion_matrix[0]+ normalized_confusion_matrix[-1])/2, 2) *100
+
+        '''
+        print('\n', '*-*'*20)
+        print('Statistics for all predictions')
+        print('Correct predictions:', correct_predictions)
+        print('Wrong predictions:', correct_predictions)
+
+        print('Confusion Matrix:', confusion_matrix)
+        print('Normalized Confusion Matrix:', normalized_confusion_matrix)
+        print('Accuracy:', accuracy, '%')
+        print('Balanced Accuracy:', balanced_accuracy, '%')
+        '''
+       
+
+        return_dict = {
+            'correct_predictions': str(correct_predictions),
+            'wrong_predictions': str(wrong_predictions),
+            'confusion_matrix': [
+                                    ('True Negatives', int(confusion_matrix[0])),
+                                    ('False Positives', int(confusion_matrix[1])),
+                                    ('False Negatives', int(confusion_matrix[2])),
+                                    ('True Positives', int(confusion_matrix[3]))
+                                ],
+            'normalized_confusion_matrix': [
+                                                ('True Negatives', round(normalized_confusion_matrix[0]*100,2)),
+                                                ('False Positives', round(normalized_confusion_matrix[1]*100,2)),
+                                                ('False Negatives', round(normalized_confusion_matrix[2]*100,2)), 
+                                                ('True Positives', round(normalized_confusion_matrix[3]*100,2))
+                                            ],
+            'accuracy': accuracy,
+            'balanced_accuracy': balanced_accuracy
+        }
+
+        return return_dict
+
+    def calculate_statistics_per_wheelset(self, data: pd.DataFrame):
+
+        num_wheelsets = len(data.index)
+
+        schrott_data = data[data['ist_schrott'] == True]
+        not_schrott_data = data[data['ist_schrott'] == False]
+        
+        wheelset_true_positives = len(schrott_data[schrott_data['num_correct_predictions']>0].index)
+        wheelset_false_positives = len(not_schrott_data[not_schrott_data['num_wrong_predictions']>0].index)
+
+        wheelset_true_negatives = len(not_schrott_data[not_schrott_data['num_wrong_predictions']==0].index)
+        wheelset_false_negatives = len(schrott_data[schrott_data['num_correct_predictions']==0].index)
+
+        wheelset_correct = wheelset_true_positives + wheelset_true_negatives
+        wheelset_wrong = wheelset_false_positives + wheelset_false_negatives
+
+        wheelset_confusion_matrix = [wheelset_true_negatives, wheelset_false_positives, wheelset_false_negatives, wheelset_true_positives]
+
+        if (wheelset_true_negatives + wheelset_false_positives) != 0 and (wheelset_false_negatives + wheelset_true_positives) != 0:
+            wheelset_normalized_confusion_matrix = [round(wheelset_true_negatives/(wheelset_true_negatives + wheelset_false_positives),3),\
+                                                    round(wheelset_false_positives/(wheelset_true_negatives + wheelset_false_positives),3),\
+                                                    round(wheelset_false_negatives/(wheelset_false_negatives + wheelset_true_positives),3),\
+                                                    round(wheelset_true_positives/(wheelset_false_negatives + wheelset_true_positives),3)]
+        else:
+            # Use an all-zero matrix so the indexing below does not fail when a class is missing
+            wheelset_normalized_confusion_matrix = [0, 0, 0, 0]
+
+        wheelset_accuracy = round(wheelset_correct/num_wheelsets, 3) *100
+        if type(wheelset_normalized_confusion_matrix) == list:
+            wheelset_balanced_accuracy = round( (wheelset_normalized_confusion_matrix[0]+ wheelset_normalized_confusion_matrix[-1])/2, 2) *100
+        else:
+            wheelset_balanced_accuracy = 'could not be calculated'
+
+        
+        print('\n', '*-*'*20)
+        print('Statistics per wheelset')
+        print('Correct predictions:', wheelset_correct)
+        print('Wrong predictions:', wheelset_wrong)
+
+        print('Confusion Matrix:', [['True Negatives', str(wheelset_confusion_matrix[0])], ['False Positives', str(wheelset_confusion_matrix[1])],\
+                                    ['False Negatives', str(wheelset_confusion_matrix[2])], ['True Positives', str(wheelset_confusion_matrix[3])]])
+        print('Normalized Confusion Matrix:', [['True Negatives', str(wheelset_normalized_confusion_matrix[0])], ['False Positives', str(wheelset_normalized_confusion_matrix[1])],\
+                                                ['False Negatives', str(wheelset_normalized_confusion_matrix[2])], ['True Positives', str(wheelset_normalized_confusion_matrix[3])]])
+        print('Accuracy:', wheelset_accuracy, '%')
+        print('Balanced Accuracy:', wheelset_balanced_accuracy, '%')
+        
+        return_dict = {
+            'correct_predictions': str(wheelset_correct),
+            'wrong_predictions': str(wheelset_wrong),
+            'confusion_matrix':[
+                                    ('True Negatives', int(wheelset_confusion_matrix[0])),
+                                    ('False Positives', int(wheelset_confusion_matrix[1])),
+                                    ('False Negatives', int(wheelset_confusion_matrix[2])),
+                                    ('True Positives', int(wheelset_confusion_matrix[3]))
+                                ],
+            'normalized_confusion_matrix': [
+                                                ('True Negatives', round(wheelset_normalized_confusion_matrix[0]*100, 2)),
+                                                ('False Positives', round(wheelset_normalized_confusion_matrix[1]*100, 2)),
+                                                ('False Negatives', round(wheelset_normalized_confusion_matrix[2]*100, 2)), 
+                                                ('True Positives', round(wheelset_normalized_confusion_matrix[3]*100, 2))
+                                            ],
+            'accuracy': wheelset_accuracy,
+            'balanced_accuracy': wheelset_balanced_accuracy
+        }
+
+        return return_dict
+
+
+    # Cost for new wheelset - 2000€
+    # Average cost for overhauling - ~3h = €250
+    # True positive: 250
+    # False positive: -2000
+    # True negative: 0
+    # False negative: -250
+
+    def custom_cost_function(self, correct_values, predictions, weights_list: list =  [0, -2000, 0, 250], normalized: bool = False):
+        from sklearn.metrics import confusion_matrix
+        tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
+        tn_weight, fp_weight, fn_weight, tp_weight = weights_list
+        max_score = (tn+fp)*tn_weight+(tp+fn)*tp_weight
+        score = tn*tn_weight + fp*fp_weight + fn*fn_weight + tp*tp_weight
+        if normalized:
+            return score/max_score
+        else:
+            return score
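+
+    # Worked example (hypothetical counts, illustrative only): with the default
+    # weights [tn, fp, fn, tp] = [0, -2000, 0, 250] and tn=90, fp=5, fn=3, tp=2,
+    # the raw score is 90*0 + 5*(-2000) + 3*0 + 2*250 = -9500, i.e. the five
+    # unnecessary replacements outweigh the two correctly caught Schrott wheelsets.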
+
+    # old weights_list: list =  [1.2, -500, -0.2, 2]
+    def balanced_custom_cost_function(self, correct_values, predictions, weights_list: list =  [0, -2000, 0, 250], normalized: bool = False):
+        from sklearn.metrics import confusion_matrix
+
+        #print(confusion_matrix(correct_values, predictions))
+        
+        tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
+
+        tn_weight, fp_weight, fn_weight, tp_weight = weights_list
+
+        #print(('tn {}, fp {}, fn {}, tp {}').format(tn, fp, fn, tp))
+
+        max_score_negatives = (tn+fp)*tn_weight
+        #print('max_score_negatives', max_score_negatives)
+        max_score_positives = (tp+fn)*tp_weight
+        #print('max_score_positives', max_score_positives)
+        score_negatives = tn*tn_weight + fp*fp_weight
+        #print('score_negatives', score_negatives)
+        score_positives = fn*fn_weight + tp*tp_weight
+        #print('score_positives', score_positives)
+
+        if normalized:
+            if max_score_negatives != 0:
+                normalized_negatives = score_negatives / max_score_negatives
+                #print('normalized_negatives', normalized_negatives)
+            else:
+                normalized_negatives = score_negatives / (max_score_negatives+0.00001)
+
+            if max_score_positives != 0:
+                normalized_positives = score_positives / max_score_positives
+                #print('normalized_positives', normalized_positives)
+            else:
+                normalized_positives = score_positives / (max_score_positives+0.00001)
+
+            return (normalized_negatives + normalized_positives) / 2
+            #print('balanced_normalized_score', balanced_normalized_score)
+
+        else:
+
+            return (score_negatives + score_positives) / 2
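+
+    # With the same hypothetical counts as above (tn=90, fp=5, fn=3, tp=2) and
+    # normalized=False, the negative-class score is 90*0 + 5*(-2000) = -10000, the
+    # positive-class score is 3*0 + 2*250 = 500, and the balanced score is
+    # (-10000 + 500) / 2 = -4750.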
+
+if __name__ == "__main__":
+
+
+    from libraries.db_handlers.OebbMongodbHandler import OebbMongodbHandler
+    mongodb_handler = OebbMongodbHandler()
+    data_explorer = DataExplorer()
+
+    from_date = str(datetime.datetime(2017, 1, 1))
+    to_date = str(datetime.datetime(2018, 1, 1))
+    
+    find_query = {
+                    'process.beginn_der_bearbeitung':{
+                        '$gt': from_date, 
+                        '$lt': to_date
+                    },
+                    'process.schrott':{
+                        '$exists': True
+                        }
+                }
+    data = mongodb_handler.query_data_and_generate_dataframe('process_instances', find_query=find_query, return_values={'radsatznummer':1, 'process.schrott':1, 'process.stationsnummer':1, 'final_state.ist_schrott':1, '_id':0}, index='radsatznummer')
+    stations = [421, 110, 130, 140, 680, 410, 510, 520, 320, 480, 490, 595, 535, None]
+    #stations = [130, 520, 140, 535, 410, 421, 680, 320, 595, 480, 490, 110, 510, None]
+    result_dict = {}
+    for station in stations:
+        result_dict.update(data_explorer.get_total_prediction_results(data, 0.5, station))#limit/10)
+    
+    result_df = pd.DataFrame.from_dict(result_dict, orient='index')
+    
+    mongodb_handler.insert_data_into_collection(result_dict, 'prediction_scores')
+
+    result_df.index.name = 'station'
+    print(result_df)
+
+    result_df.to_excel(os.path.join('.', 'documentation', 'prediction_scores.xlsx'))