Sfoglia il codice sorgente

Moved DataExplorer from Oebb to CDPLib

ogert 4 anni fa
parent
commit
7ca977d0cf
1 file modificato con 602 aggiunte e 0 eliminazioni
  1. 602 0
      cdplib/DataExplorer/DataExplorer.py

+ 602 - 0
cdplib/DataExplorer/DataExplorer.py

@@ -0,0 +1,602 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+import time
+from pprint import pprint
+import pandas as pd
+import numpy as np
+import datetime
+sys.path.append(os.getcwd())
+from copy import deepcopy
+from cdplib.log import Log
+from cdplib.FlattenData import FlattenData
+from libraries.SimplifiedProcessModel import SimplifiedProcessModel
+from libraries.base_path_prediction.Base_Path_Predictor import Base_Path_Predictor
+from libraries.configuration import default as cfg
+from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
+
+class DataExplorer:
+
+    def __init__(self):
+        """Create the explorer together with its logger, which is named "Data Explorer"."""
+        # The logger is used by the calculation methods for progress and warning messages.
+        self._log = Log("Data Explorer")
+
+    def calculate_correlation(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False):
+
+        no_nan_data = data.fillna(0)
+        correlations = []
+        
+        pearson_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict,  method='pearson')
+        self._log.info('Calculated Pearson correlation')
+
+        if verbose:
+            kendall_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict,  method='kendall')
+            correlations.append(kendall_correlation)
+            self._log.info('Calculated Kendall correlation')
+
+            spearman_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict,  method='spearman')
+            correlations.append(spearman_correlation)
+            self._log.info('Calculated Spearman correlation')
+
+            cosine_similarity = self.calculate_cosine_similarity_for_dataframe_columns(no_nan_data, column_name_to_predict)
+            correlations.append(cosine_similarity)
+            self._log.info('Calculated cosine similarity')
+
+            chi2_independence = self.calculate_chi2_independence_for_dataframe_columns(data, column_name_to_predict, verbose)
+            correlations.append(chi2_independence)
+            self._log.info('Calculated chi2 independence')
+
+            fisher_exact_test = self.calculate_fisher_exact_for_dataframe_column(data, column_name_to_predict)
+            correlations.append(fisher_exact_test)
+            self._log.info('Calculated Fisher Exact Test')
+
+        gert_correlation = self.calculate_feature_and_schrott_occurunces(data, column_name_to_predict, prediction_name, verbose)
+        correlations.append(gert_correlation)
+        self._log.info('Calculated Gert correlation')
+        
+        merged_data = pearson_correlation
+        for correlation in correlations:
+            if len(correlation.index) > 0:
+                merged_data = merged_data.join(correlation, how='left')
+        
+        return merged_data
+     
+
+    def calculate_big_matrix_correlation(self, data: pd.DataFrame, column_name_to_predict: str, method: str='pearson') -> pd.DataFrame():
+
+        num_columns = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
+        result_data = {}
+        for column in data.columns:
+            if data[column].dtype in num_columns: 
+                result_data[column] = data[column_name_to_predict].corr(data[column], method=method)
+        
+        result_df = pd.DataFrame.from_dict(result_data, orient='index')
+        label_string = method + ' correlation'
+        result_df.columns = [label_string]
+        return result_df
+        
+    def calculate_feature_and_schrott_occurunces(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False) -> pd.DataFrame():
+
+        result_dict = {}
+        len_data = len(data.index)
+        total_schrott = len(data[data[column_name_to_predict] == 1].index)
+        counter = 0
+        for column in data.columns:
+            occurunces = 0
+            schrott_occurunces = 0
+            temp_data = data[data[column] == 1]
+            #print('Temp Data:', temp_data)
+            if len(temp_data.index) > 0:
+                occurunces = len(temp_data.index)
+                schrott_occurunces = len(temp_data[temp_data[column_name_to_predict] == 1].index)
+                schrott_wheelsets = list(temp_data[temp_data[column_name_to_predict] == 1].index)
+                non_schrott = occurunces - schrott_occurunces
+                non_occurunces = len_data - occurunces
+                percentage_of_occurances_schrott = round((schrott_occurunces/occurunces)*100, 2)
+                percentage_of_occurances_not_schrott = round((non_schrott/occurunces)*100, 2)
+                
+                if verbose:
+                    result_dict[column] = {
+                        'Occurs':occurunces,
+                        '%_Occurs': occurunces/len_data,
+
+                        prediction_name + '_Occurs':schrott_occurunces,
+                        '%_' + prediction_name + '_Occurs': percentage_of_occurances_schrott,
+                        '%_Total_' + prediction_name: round((schrott_occurunces/total_schrott)*100, 2),
+
+                        '!Occurs': non_occurunces,
+                        '%_!Occurs': round((non_occurunces/len_data)*100, 2),
+
+                        'Occurs_!' + prediction_name: non_schrott,
+                        '%_Occurs_!' + prediction_name: percentage_of_occurances_not_schrott,
+                        '%_Total_!' + prediction_name: round((non_schrott/len_data)*100, 2),
+
+                        'Wheelsets_!' + prediction_name: [wheelset for wheelset in list(temp_data.index) if wheelset not in schrott_wheelsets],
+                        'Wheelsets_' + prediction_name: schrott_wheelsets
+                    }
+                else:
+                    result_dict[column] = {
+
+                        'Occurs':occurunces,
+                        '%_Occurs': occurunces/len_data,
+
+                        'Schrott_Occurs':schrott_occurunces,
+                        '%_Schrott_Occurs': percentage_of_occurances_schrott,
+                        '%_Total_Schrott': schrott_occurunces/total_schrott,
+
+                    }
+
+                counter+=1
+                if counter % 100 == 0:
+                    print(('Calculated {} / {}').format(counter, len(data.columns)))
+            
+        return pd.DataFrame.from_dict(result_dict, orient='index')
+
+    def calculate_fisher_exact_for_dataframe_column(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame():
+      
+        from scipy.stats import fisher_exact
+        result_dict = {}
+        predict_count = data[column_name_to_predict]
+
+        for column in data.columns:
+            
+            fisher_data = pd.crosstab(predict_count, data[column])
+            if fisher_data.shape == (2, 2):
+                oddsratio, pvalue = fisher_exact(fisher_data)
+                result_dict[column] = {
+                                        'Fisher_Exact_pvalue': pvalue,
+                                        'Fisher_Exact_oddsratio': oddsratio
+                                        }
+            else:
+                result_dict[column] = {
+                                        'Fisher_Exact_pvalue': None,
+                                        'Fisher_Exact_oddsratio': None
+                                        }
+        return pd.DataFrame.from_dict(result_dict, orient='index')
+        
+
+    def calculate_cosine_similarity_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame():
+        from sklearn.metrics.pairwise import cosine_similarity
+        cosine_corr = cosine_similarity(data.transpose())
+        cosine_df = pd.DataFrame(cosine_corr, columns=data.columns, index=data.columns)
+        cosine_df = pd.DataFrame(cosine_df[column_name_to_predict])
+        cosine_df.columns = ['Cosine Similarity']
+
+        return cosine_df
+
+    def calculate_chi2_independence_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str, verbose: bool = False) -> pd.DataFrame():
+        from scipy.stats import chi2_contingency
+
+        result_dict = {}
+        Y = data[column_name_to_predict]
+        for column in data.columns:
+            X = data[column]
+
+            observed = pd.crosstab(Y,X)
+            chi2=None
+            p=None
+            dof=None
+            expected=None
+            if observed.size == 4:
+                chi2, p, dof, expected = chi2_contingency(observed.values)
+
+                if dof > 1:
+                    self._log.warning(('Calculation contained DOF > 1, this is the data:\n {}').format(X))
+
+                if verbose:
+                    result_dict[column] = {
+                                            'Chi2': chi2,
+                                            'Chi2, p':p,
+                                            'Chi2, dof':dof,
+                                            'Chi2, expected':expected
+                                        }
+                else:
+                    result_dict[column] = {
+                                            'Chi2, p':p,
+                                        }
+            else:
+                if verbose:
+                    result_dict[column] = {
+                                            'Chi2': None,
+                                            'Chi2, p':None,
+                                            'Chi2, dof':None,
+                                            'Chi2, expected':None
+                                        }
+                else:
+                    result_dict[column] = {
+                                            'Chi2, p':None,
+                                        }
+            
+        return pd.DataFrame.from_dict(result_dict, orient='index')
+
+    def calculate_machine_learning_scores(self, predictions, true_values, y_train, to_date:str = None):
+        '''
+        Calculated several different measures on the predictions compared to the true values, returns 2 DataFrames, one to be submitted to Mongodb and one to be saved to excel.
+        
+        :param predictions: The machine learning predictions
+        :param true_values: The correct predictions for the dataset
+        :param y_train: The correct predictions for the training data.
+        '''
+        from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, balanced_accuracy_score  
+
+        if to_date is None:
+            to_date = str(datetime.datetime.utcnow())
+
+        confusion_matrix = confusion_matrix(true_values, predictions)
+        #normalized_confusion_matrix = confusion_matrix(true_values, predictions, normalize=True)
+        normalized_confusion_matrix = confusion_matrix.astype('float') / confusion_matrix.sum(axis=1)[:, np.newaxis]
+        classification_report = classification_report(true_values, predictions)
+        accuracy_score = round(accuracy_score(true_values, predictions)*100, 2)
+        roc_auc_score = round(roc_auc_score(true_values, predictions)*100,2)
+        balanced_custom_cost_score = self.balanced_custom_cost_function(true_values, predictions)
+        custom_cost_score = self.custom_cost_function(true_values, predictions)
+        #roc_auc_score = 'Unknown'
+        balanced_accuracy = round(balanced_accuracy_score(true_values, predictions)*100, 2)
+        
+        
+        print("Timestamp\n", to_date)
+        print('\n')
+        print(('% True Positives = {}').format(confusion_matrix[1][1]/(confusion_matrix[1][1]+confusion_matrix[1][0])))
+        print(('% False Positives = {}').format(confusion_matrix[0][1]/(confusion_matrix[0][1]+confusion_matrix[0][0])))
+        print('\n')
+        print('Accuracy:\n', accuracy_score)
+        print('\n')
+        print('Balanced Accuracy:\n', balanced_accuracy)
+        print('\n')
+        print('ROC AUC Score:\n', roc_auc_score)
+        print('\n')
+        print('Balanced Custom Cost Function:\n', balanced_custom_cost_score)
+        print('\n')
+        print('Custom Cost Function:\n', custom_cost_score)
+        print('\n')
+        print('Confusion Matrix:\n', confusion_matrix)
+        print('\n')
+        print('Normalized Confusion Matrix:\n', normalized_confusion_matrix)
+        print('\n')
+        print('Classification Report:\n', classification_report)
+        
+        result_dict = {
+            'Size of training set:': len(y_train),
+            'Number of Schrott in training set': y_train.value_counts().loc[True],
+            'Size of prediction set:': len(true_values),
+            'Number of Schrott in prediction set': true_values.value_counts().loc[True],
+            'Confusion Matrix': confusion_matrix,
+            'Normalized Confusion Matrix': normalized_confusion_matrix,
+            'Accuracy': accuracy_score,
+            'Balanced Accuracy': balanced_accuracy,
+            'ROC AUC Score:': roc_auc_score,
+            'Balanced Custom Cost Score:': balanced_custom_cost_score,
+            'Custom Cost Score:': custom_cost_score
+        }
+
+        confusion_matrix = confusion_matrix.ravel()
+
+        database_dict = {
+                "timestamp": to_date,
+                "result": {
+                    "training_set":{
+                        "amount": int(len(y_train)),
+                        "schrott": int(y_train.value_counts().loc[True])
+                    },
+                    "test_set":{
+                        "amount": int(len(true_values)),
+                        "schrott": int(true_values.value_counts().loc[True])
+                    },
+                    "confusion_matrix":[
+                        ("True Negatives", int(confusion_matrix[0])),
+                        ("False Positives", int(confusion_matrix[1])),
+                        ("False Negatives", int(confusion_matrix[2])),
+                        ("True Positives", int(confusion_matrix[3]))
+                    ],
+                    "normalized_confusion_matrix":[
+                        ("True Negatives", round(normalized_confusion_matrix[0][0]*100, 2)),
+                        ("False Positives", round(normalized_confusion_matrix[0][1]*100, 2)),
+                        ("False Negatives", round(normalized_confusion_matrix[1][0]*100, 2)),
+                        ("True Positives", round(normalized_confusion_matrix[1][1]*100, 2))
+                    ],
+                    "accuracy": accuracy_score,
+                    "balanced_accuracy": balanced_accuracy,
+                    "ROC_AUC_score": roc_auc_score,
+                    'balanced_custom_cost_score': int(balanced_custom_cost_score),
+                    'custom_cost_score': int(custom_cost_score)
+                }   
+        }
+
+        pprint(database_dict)
+        
+        #result_dict={}
+        #database_dict={}
+        return result_dict, database_dict
+
+    def get_total_prediction_results(self, in_data, limit: float = 0.5, station = None):
+
+        data = deepcopy(in_data)
+
+        data_flattener = FlattenData()
+        data['ist_schrott'] = data_flattener.flatten(data['final_state'])
+        predictions_lists = []  
+        stations_lists = []
+        num_correct_predictions = []
+        num_wrong_predictions = []
+        
+        for true_value, predictions in zip(data['ist_schrott'], data['process']):
+
+            predictions_list, stations_list= self.filter_by_threshold_and_station(predictions, limit, station)
+            predictions_lists.append(predictions_list)
+            stations_lists.append(stations_list)
+
+            num_true = sum(predictions_list)
+            num_false = len(predictions_list) - sum(predictions_list)
+            if true_value:
+                
+                num_correct_predictions.append(num_true)
+                num_wrong_predictions.append(num_false)
+            else:
+                num_correct_predictions.append(num_false)
+                num_wrong_predictions.append(num_true)
+            
+        # Fill DataFrame
+        data['stations'] = stations_lists
+        data['predictions'] = predictions_lists
+        data['num_predictions'] = data['predictions'].str.len()
+        data['num_correct_predictions'] = num_correct_predictions
+        data['%_correct_predictions'] = round(data['num_correct_predictions'] / data['num_predictions'], 3)*100
+        data['num_wrong_predictions'] = num_wrong_predictions
+        data['%_wrong_predictions'] = round(data['num_wrong_predictions'] / data['num_predictions'], 3)*100
+        '''
+        print('LIMIT:', limit)
+        num_wheelsets = len(data.index) 
+
+        # Print results
+        print('Predictions for station:', station if not None else 'All stations')
+        print('Wheelsets predicted:', num_wheelsets)
+        print('Predictions made:', data['num_predictions'].sum())
+        '''
+        return_dict = {}
+
+        if station is None:
+            station = 'All Stations'
+            return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data) }
+            return_dict[station+' per Wheelset'] = {'result': self.calculate_statistics_per_wheelset(data)}
+        else:
+            return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data) }
+
+        data.drop(['final_state', 'process'], axis=1, inplace=True)
+
+        return return_dict
+
+    def filter_by_threshold_and_station(self, schrott_list: list, limit: float = 0.5, station=None):
+    
+        
+        prediction_list = []
+        station_list = []
+        for value in schrott_list:
+            if value and 'schrott' in value.keys():
+                if station is None:
+                    station_list.append(value['stationsnummer'])
+                    prediction_list.append(value['schrott'] > limit)
+                elif value['stationsnummer'] == station:
+                    station_list.append(value['stationsnummer'])
+                    prediction_list.append(value['schrott'] > limit)
+            
+
+        return (prediction_list, station_list)
+
+    def calculate_statistics_for_all_predictions(self, data: pd.DataFrame):
+
+        correct_predictions = data['num_correct_predictions'].sum()
+        wrong_predictions = data['num_wrong_predictions'].sum()
+
+        true_positives =data[data['ist_schrott']==True]['num_correct_predictions'].sum()
+        false_negatives = data[data['ist_schrott']==True]['num_wrong_predictions'].sum()
+
+        true_negatives = data[data['ist_schrott']==False]['num_correct_predictions'].sum()
+        false_positives = data[data['ist_schrott']==False]['num_wrong_predictions'].sum()
+
+        confusion_matrix = [true_negatives, false_positives, false_negatives, true_positives]
+        if (true_negatives + false_positives) != 0 and (false_negatives + true_positives) != 0:
+            normalized_confusion_matrix = [ round(true_negatives/(true_negatives + false_positives),3),\
+                                            round(false_positives/(true_negatives + false_positives),3),\
+                                            round(false_negatives/(false_negatives + true_positives),3),\
+                                            round(true_positives/(false_negatives + true_positives),3)]
+            
+        else:
+            normalized_confusion_matrix = 0
+        accuracy = round(data['num_correct_predictions'].sum()/(data['num_predictions'].sum()), 3) *100
+        balanced_accuracy = round( (normalized_confusion_matrix[0]+ normalized_confusion_matrix[-1])/2, 2) *100
+
+        '''
+        print('\n', '*-*'*20)
+        print('Statistics for all predictions')
+        print('Correct predictions:', correct_predictions)
+        print('Wrong predictions:', correct_predictions)
+
+        print('Confusion Matrix:', confusion_matrix)
+        print('Normalized Confusion Matrix:', normalized_confusion_matrix)
+        print('Accuracy:', accuracy, '%')
+        print('Balanced Accuracy:', balanced_accuracy, '%')
+        '''
+       
+
+        return_dict = {
+            'correct_predictions': str(correct_predictions),
+            'wrong_predictions': str(wrong_predictions),
+            'confusion_matrix': [
+                                    ('True Negatives', int(confusion_matrix[0])),
+                                    ('False Positives', int(confusion_matrix[1])),
+                                    ('False Negatives', int(confusion_matrix[2])),
+                                    ('True Positives', int(confusion_matrix[3]))
+                                ],
+            'normalized_confusion_matrix': [
+                                                ('True Negatives', round(normalized_confusion_matrix[0]*100,2)),
+                                                ('False Positives', round(normalized_confusion_matrix[1]*100,2)),
+                                                ('False Negatives', round(normalized_confusion_matrix[2]*100,2)), 
+                                                ('True Positives', round(normalized_confusion_matrix[3]*100,2))
+                                            ],
+            'accuracy': accuracy,
+            'balanced_accuracy': balanced_accuracy
+        }
+
+        return return_dict
+
+    def calculate_statistics_per_wheelset(self, data: pd.DataFrame):
+
+        num_wheelsets = len(data.index)
+
+        schrott_data = data[data['ist_schrott'] == True]
+        not_schrott_data = data[data['ist_schrott'] == False]
+        
+        wheelset_true_positives = len(schrott_data[schrott_data['num_correct_predictions']>0].index)
+        wheelset_false_positives = len(not_schrott_data[not_schrott_data['num_wrong_predictions']>0].index)
+
+        wheelset_true_negatives = len(not_schrott_data[not_schrott_data['num_wrong_predictions']==0].index)
+        wheelset_false_negatives = len(schrott_data[schrott_data['num_correct_predictions']==0].index)
+
+        wheelset_correct = wheelset_true_positives + wheelset_true_negatives
+        wheelset_wrong = wheelset_false_positives + wheelset_false_negatives
+
+        wheelset_confusion_matrix = [wheelset_true_negatives, wheelset_false_positives, wheelset_false_negatives, wheelset_true_positives]
+
+        if (wheelset_true_negatives + wheelset_false_positives) != 0 and (wheelset_false_negatives + wheelset_true_positives) != 0:
+            wheelset_normalized_confusion_matrix = [round(wheelset_true_negatives/(wheelset_true_negatives + wheelset_false_positives),3),\
+                                                    round(wheelset_false_positives/(wheelset_true_negatives + wheelset_false_positives),3),\
+                                                    round(wheelset_false_negatives/(wheelset_false_negatives + wheelset_true_positives),3),\
+                                                    round(wheelset_true_positives/(wheelset_false_negatives + wheelset_true_positives),3)]
+        else:
+            wheelset_normalized_confusion_matrix = 0
+
+        wheelset_accuracy = round(wheelset_correct/num_wheelsets, 3) *100
+        if type(wheelset_normalized_confusion_matrix) == list:
+            wheelset_balanced_accuracy = round( (wheelset_normalized_confusion_matrix[0]+ wheelset_normalized_confusion_matrix[-1])/2, 2) *100
+        else:
+            wheelset_balanced_accuracy = 'could not be calculates'
+
+        
+        print('\n', '*-*'*20)
+        print('Statistics per wheelset')
+        print('Correct predictions:', wheelset_correct)
+        print('Wrong predictions:', wheelset_wrong)
+
+        print('Confusion Matrix:', [['True Negatives', str(wheelset_confusion_matrix[0])], ['False Positives', str(wheelset_confusion_matrix[1])],\
+                                    ['False Negatives', str(wheelset_confusion_matrix[2])], ['True Positives', str(wheelset_confusion_matrix[3])]])
+        print('Normalized Confusion Matrix:', [['True Negatives', str(wheelset_normalized_confusion_matrix[0])], ['False Positives', str(wheelset_normalized_confusion_matrix[1])],\
+                                                ['False Negatives', str(wheelset_normalized_confusion_matrix[2])], ['True Positives', str(wheelset_normalized_confusion_matrix[3])]])
+        print('Accuracy:', wheelset_accuracy, '%')
+        print('Balanced Accuracy:', wheelset_balanced_accuracy, '%')
+        
+        return_dict = {
+            'correct_predictions': str(wheelset_correct),
+            'wrong_predictions': str(wheelset_wrong),
+            'confusion_matrix':[
+                                    ('True Negatives', int(wheelset_confusion_matrix[0])),
+                                    ('False Positives', int(wheelset_confusion_matrix[1])),
+                                    ('False Negatives', int(wheelset_confusion_matrix[2])),
+                                    ('True Positives', int(wheelset_confusion_matrix[3]))
+                                ],
+            'normalized_confusion_matrix': [
+                                                ('True Negatives', round(wheelset_normalized_confusion_matrix[0]*100, 2)),
+                                                ('False Positives', round(wheelset_normalized_confusion_matrix[1]*100, 2)),
+                                                ('False Negatives', round(wheelset_normalized_confusion_matrix[2]*100, 2)), 
+                                                ('True Positives', round(wheelset_normalized_confusion_matrix[3]*100, 2))
+                                            ],
+            'accuracy': wheelset_accuracy,
+            'balanced_accuracy': wheelset_balanced_accuracy
+        }
+
+        return return_dict
+
+
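+    # Cost for a new wheelset - €2000
+    # Average cost for overhauling - ~3h = €250
+    # True positive: 250
+    # False positive: -2000
+    # True negative: 0
+    # False negative: -250
+    # NOTE: the default weights_list of the cost functions below is
+    # [tn, fp, fn, tp] = [0, -2000, 0, 250], i.e. false negatives are
+    # currently weighted with 0 and not with -250.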
+
+    def custom_cost_function(self, correct_values, predictions, weights_list: list =  [0, -2000, 0, 250], normalized: bool = False):
+        from sklearn.metrics import confusion_matrix
+        tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
+        tn_weight, fp_weight, fn_weight, tp_weight = weights_list
+        max_score = (tn+fp)*tn_weight+(tp+fn)*tp_weight
+        score = tn*tn_weight + fp*fp_weight + fn*fn_weight + tp*tp_weight
+        if normalized:
+            return score/max_score
+        else:
+            return score
+
+    # old weights_list: list =  [1.2, -500, -0.2, 2]
+    def balanced_custom_cost_function(self, correct_values, predictions, weights_list: list =  [0, -2000, 0, 250], normalized: bool = False):
+        from sklearn.metrics import confusion_matrix
+
+        #print(confusion_matrix(correct_values, predictions))
+        
+        tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
+
+        tn_weight, fp_weight, fn_weight, tp_weight = weights_list
+
+        #print(('tn {}, fp {}, fn {}, tp {}').format(tn, fp, fn, tp))
+
+        max_score_negatives = (tn+fp)*tn_weight
+        #print('max_score_negatives', max_score_negatives)
+        max_score_positives = (tp+fn)*tp_weight
+        #print('max_score_positives', max_score_positives)
+        score_negatives = tn*tn_weight + fp*fp_weight
+        #print('score_negatives', score_negatives)
+        score_positives = fn*fn_weight + tp*tp_weight
+        #print('score_positives', score_positives)
+
+        if normalized:
+            if max_score_negatives != 0:
+                normalized_negatives = score_negatives / max_score_negatives
+                #print('normalized_negatives', normalized_negatives)
+            else:
+                normalized_negatives = score_negatives / (max_score_negatives+0.00001)
+
+            if max_score_positives != 0:
+                normalized_positives = score_positives / max_score_positives
+                #print('normalized_positives', normalized_positives)
+            else:
+                normalized_positives = score_positives / (max_score_positives+0.00001)
+
+            return (normalized_negatives + normalized_positives) / 2
+            #print('balanced_normalized_score', balanced_normalized_score)
+
+        else:
+
+            return (score_negatives + score_positives) / 2
+
+if __name__ == "__main__":
+
+
+    from libraries.db_handlers.OebbMongodbHandler import OebbMongodbHandler
+    mongodb_handler = OebbMongodbHandler()
+    data_explorer = DataExplorer()
+
+    from_date = str(datetime.datetime(2017, 1, 1))
+    to_date = str(datetime.datetime(2018, 1, 1))
+    
+    find_query = {
+                    'process.beginn_der_bearbeitung':{
+                        '$gt': from_date, 
+                        '$lt': to_date
+                    },
+                    'process.schrott':{
+                        '$exists': True
+                        }
+                }
+    data = mongodb_handler.query_data_and_generate_dataframe('process_instances', find_query=find_query, return_values={'radsatznummer':1, 'process.schrott':1, 'process.stationsnummer':1, 'final_state.ist_schrott':1, '_id':0}, index='radsatznummer')
+    stations = [421, 110, 130, 140, 680, 410, 510, 520, 320, 480, 490, 595, 535, None]
+    #stations = [130, 520, 140, 535, 410, 421, 680, 320, 595, 480, 490, 110, 510, None]
+    result_dict = {}
+    for station in stations:
+        result_dict.update(data_explorer.get_total_prediction_results(data, 0.5, station))#limit/10)
+    
+    result_df = pd.DataFrame.from_dict(result_dict, orient='index')
+    
+    mongodb_handler.insert_data_into_collection(result_dict, 'prediction_scores')
+
+    result_df.index.name = 'station'
+    print(result_df)
+
+    result_df.to_excel(os.path.join('.', 'documentation', 'prediction_scores.xlsx'))