- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import sys
- import os
- import datetime
- from copy import deepcopy
- from pprint import pprint
- import pandas as pd
- import numpy as np
- sys.path.append(os.getcwd())
- from cdplib.log import Log
- from cdplib.FlattenData import FlattenData
- class DataExplorer:
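- '''
- Exploratory statistics (correlations and association tests) and evaluation
- helpers for binary scrap ("Schrott") predictions on wheelset process data.
- '''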
- def __init__(self):
- self._log = Log("Data Explorer")
- def calculate_correlation(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False) -> pd.DataFrame:
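- '''
- Collects several correlation and association measures between each column
- and the target column and joins them into one DataFrame. Kendall and
- Spearman correlations are only calculated when verbose is True.
- '''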
- no_nan_data = data.fillna(0)
- correlations = []
-
- pearson_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict, method='pearson')
- self._log.info('Calculated Pearson correlation')
- if verbose:
- kendall_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict, method='kendall')
- correlations.append(kendall_correlation)
- self._log.info('Calculated Kendall correlation')
- spearman_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict, method='spearman')
- correlations.append(spearman_correlation)
- self._log.info('Calculated Spearman correlation')
- cosine_similarity = self.calculate_cosine_similarity_for_dataframe_columns(no_nan_data, column_name_to_predict)
- correlations.append(cosine_similarity)
- self._log.info('Calculated cosine similarity')
- chi2_independence = self.calculate_chi2_independence_for_dataframe_columns(data, column_name_to_predict, verbose)
- correlations.append(chi2_independence)
- self._log.info('Calculated chi2 independence')
- fisher_exact_test = self.calculate_fisher_exact_for_dataframe_column(data, column_name_to_predict)
- correlations.append(fisher_exact_test)
- self._log.info('Calculated Fisher Exact Test')
- gert_correlation = self.calculate_feature_and_schrott_occurrences(data, column_name_to_predict, prediction_name, verbose)
- correlations.append(gert_correlation)
- self._log.info('Calculated Gert correlation')
-
- merged_data = pearson_correlation
- for correlation in correlations:
- if len(correlation.index) > 0:
- merged_data = merged_data.join(correlation, how='left')
-
- return merged_data
-
- def calculate_big_matrix_correlation(self, data: pd.DataFrame, column_name_to_predict: str, method: str = 'pearson') -> pd.DataFrame:
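- '''
- Correlates each numeric column with the target column pairwise, which
- avoids materializing the full column-by-column correlation matrix.
- '''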
- numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
- result_data = {}
- for column in data.columns:
- if data[column].dtype in numeric_dtypes:
- result_data[column] = data[column_name_to_predict].corr(data[column], method=method)
-
- result_df = pd.DataFrame.from_dict(result_data, orient='index')
- label_string = method + ' correlation'
- result_df.columns = [label_string]
- return result_df
-
- def calculate_feature_and_schrott_occurrences(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False) -> pd.DataFrame:
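- '''
- Counts, for every binary feature column, how often it occurs and how often
- it co-occurs with the target ("Schrott" = scrap); verbose adds complement
- counts and the affected wheelset indices. Assumes the target column
- contains at least one positive row.
- '''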
- result_dict = {}
- len_data = len(data.index)
- total_schrott = len(data[data[column_name_to_predict] == 1].index)
- counter = 0
- for column in data.columns:
- occurrences = 0
- schrott_occurrences = 0
- temp_data = data[data[column] == 1]
- if len(temp_data.index) > 0:
- occurrences = len(temp_data.index)
- schrott_occurrences = len(temp_data[temp_data[column_name_to_predict] == 1].index)
- schrott_wheelsets = list(temp_data[temp_data[column_name_to_predict] == 1].index)
- non_schrott = occurrences - schrott_occurrences
- non_occurrences = len_data - occurrences
- percentage_of_occurrences_schrott = round((schrott_occurrences/occurrences)*100, 2)
- percentage_of_occurrences_not_schrott = round((non_schrott/occurrences)*100, 2)
-
- if verbose:
- result_dict[column] = {
- 'Occurs': occurrences,
- '%_Occurs': round((occurrences/len_data)*100, 2),
- prediction_name + '_Occurs': schrott_occurrences,
- '%_' + prediction_name + '_Occurs': percentage_of_occurrences_schrott,
- '%_Total_' + prediction_name: round((schrott_occurrences/total_schrott)*100, 2),
- '!Occurs': non_occurrences,
- '%_!Occurs': round((non_occurrences/len_data)*100, 2),
- 'Occurs_!' + prediction_name: non_schrott,
- '%_Occurs_!' + prediction_name: percentage_of_occurrences_not_schrott,
- '%_Total_!' + prediction_name: round((non_schrott/len_data)*100, 2),
- 'Wheelsets_!' + prediction_name: [wheelset for wheelset in list(temp_data.index) if wheelset not in schrott_wheelsets],
- 'Wheelsets_' + prediction_name: schrott_wheelsets
- }
- else:
- result_dict[column] = {
- 'Occurs': occurrences,
- '%_Occurs': round((occurrences/len_data)*100, 2),
- 'Schrott_Occurs': schrott_occurrences,
- '%_Schrott_Occurs': percentage_of_occurrences_schrott,
- '%_Total_Schrott': round((schrott_occurrences/total_schrott)*100, 2),
- }
- counter += 1
- if counter % 100 == 0:
- self._log.info('Calculated {} / {}'.format(counter, len(data.columns)))
-
- return pd.DataFrame.from_dict(result_dict, orient='index')
- def calculate_fisher_exact_for_dataframe_column(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame:
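- '''
- Runs Fisher's exact test between each column and the target column; only
- 2x2 contingency tables are tested, all other columns get None values.
- '''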
-
- from scipy.stats import fisher_exact
- result_dict = {}
- predict_count = data[column_name_to_predict]
- for column in data.columns:
-
- fisher_data = pd.crosstab(predict_count, data[column])
- if fisher_data.shape == (2, 2):
- oddsratio, pvalue = fisher_exact(fisher_data)
- result_dict[column] = {
- 'Fisher_Exact_pvalue': pvalue,
- 'Fisher_Exact_oddsratio': oddsratio
- }
- else:
- result_dict[column] = {
- 'Fisher_Exact_pvalue': None,
- 'Fisher_Exact_oddsratio': None
- }
- return pd.DataFrame.from_dict(result_dict, orient='index')
-
- def calculate_cosine_similarity_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame:
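- '''
- Computes pairwise cosine similarity between all columns and keeps the
- column of similarities to the target; expects data without NaNs.
- '''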
- from sklearn.metrics.pairwise import cosine_similarity
- cosine_corr = cosine_similarity(data.transpose())
- cosine_df = pd.DataFrame(cosine_corr, columns=data.columns, index=data.columns)
- cosine_df = pd.DataFrame(cosine_df[column_name_to_predict])
- cosine_df.columns = ['Cosine Similarity']
- return cosine_df
- def calculate_chi2_independence_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str, verbose: bool = False) -> pd.DataFrame:
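- '''
- Runs a chi-squared independence test between each column and the target
- column; only 2x2 contingency tables are tested, other columns get None.
- '''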
- from scipy.stats import chi2_contingency
- result_dict = {}
- Y = data[column_name_to_predict]
- for column in data.columns:
- X = data[column]
- observed = pd.crosstab(Y, X)
- chi2 = None
- p = None
- dof = None
- expected = None
- if observed.size == 4:
- chi2, p, dof, expected = chi2_contingency(observed.values)
- if dof > 1:
- self._log.warning('Calculation contained DOF > 1, this is the data:\n {}'.format(X))
- if verbose:
- result_dict[column] = {
- 'Chi2': chi2,
- 'Chi2, p':p,
- 'Chi2, dof':dof,
- 'Chi2, expected':expected
- }
- else:
- result_dict[column] = {
- 'Chi2, p':p,
- }
- else:
- if verbose:
- result_dict[column] = {
- 'Chi2': None,
- 'Chi2, p':None,
- 'Chi2, dof':None,
- 'Chi2, expected':None
- }
- else:
- result_dict[column] = {
- 'Chi2, p':None,
- }
-
- return pd.DataFrame.from_dict(result_dict, orient='index')
- def calculate_machine_learning_scores(self, predictions, true_values, y_train, to_date: str = None):
- '''
- Calculates several different scores for the predictions compared to the true values and returns two dictionaries: one to be submitted to MongoDB and one to be saved to Excel.
-
- :param predictions: The machine learning predictions
- :param true_values: The correct labels for the prediction set
- :param y_train: The correct labels for the training data
- :param to_date: Timestamp for the report; defaults to the current UTC time
- '''
- from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, balanced_accuracy_score
- if to_date is None:
- to_date = str(datetime.datetime.utcnow())
- # Use names that do not shadow the imported sklearn functions.
- conf_matrix = confusion_matrix(true_values, predictions)
- normalized_conf_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
- class_report = classification_report(true_values, predictions)
- accuracy = round(accuracy_score(true_values, predictions)*100, 2)
- roc_auc = round(roc_auc_score(true_values, predictions)*100, 2)
- balanced_custom_cost_score = self.balanced_custom_cost_function(true_values, predictions)
- custom_cost_score = self.custom_cost_function(true_values, predictions)
- balanced_accuracy = round(balanced_accuracy_score(true_values, predictions)*100, 2)
-
- print("Timestamp\n", to_date)
- print('\n')
- print('% True Positives = {}'.format(conf_matrix[1][1]/(conf_matrix[1][1] + conf_matrix[1][0])))
- print('% False Positives = {}'.format(conf_matrix[0][1]/(conf_matrix[0][1] + conf_matrix[0][0])))
- print('\n')
- print('Accuracy:\n', accuracy)
- print('\n')
- print('Balanced Accuracy:\n', balanced_accuracy)
- print('\n')
- print('ROC AUC Score:\n', roc_auc)
- print('\n')
- print('Balanced Custom Cost Function:\n', balanced_custom_cost_score)
- print('\n')
- print('Custom Cost Function:\n', custom_cost_score)
- print('\n')
- print('Confusion Matrix:\n', conf_matrix)
- print('\n')
- print('Normalized Confusion Matrix:\n', normalized_conf_matrix)
- print('\n')
- print('Classification Report:\n', class_report)
-
- result_dict = {
- 'Size of training set': len(y_train),
- 'Number of Schrott in training set': y_train.value_counts().loc[True],
- 'Size of prediction set': len(true_values),
- 'Number of Schrott in prediction set': true_values.value_counts().loc[True],
- 'Confusion Matrix': conf_matrix,
- 'Normalized Confusion Matrix': normalized_conf_matrix,
- 'Accuracy': accuracy,
- 'Balanced Accuracy': balanced_accuracy,
- 'ROC AUC Score': roc_auc,
- 'Balanced Custom Cost Score': balanced_custom_cost_score,
- 'Custom Cost Score': custom_cost_score
- }
- flat_conf_matrix = conf_matrix.ravel()
- database_dict = {
- "timestamp": to_date,
- "result": {
- "training_set": {
- "amount": int(len(y_train)),
- "schrott": int(y_train.value_counts().loc[True])
- },
- "test_set": {
- "amount": int(len(true_values)),
- "schrott": int(true_values.value_counts().loc[True])
- },
- "confusion_matrix": [
- ("True Negatives", int(flat_conf_matrix[0])),
- ("False Positives", int(flat_conf_matrix[1])),
- ("False Negatives", int(flat_conf_matrix[2])),
- ("True Positives", int(flat_conf_matrix[3]))
- ],
- "normalized_confusion_matrix": [
- ("True Negatives", round(normalized_conf_matrix[0][0]*100, 2)),
- ("False Positives", round(normalized_conf_matrix[0][1]*100, 2)),
- ("False Negatives", round(normalized_conf_matrix[1][0]*100, 2)),
- ("True Positives", round(normalized_conf_matrix[1][1]*100, 2))
- ],
- "accuracy": accuracy,
- "balanced_accuracy": balanced_accuracy,
- "ROC_AUC_score": roc_auc,
- 'balanced_custom_cost_score': int(balanced_custom_cost_score),
- 'custom_cost_score': int(custom_cost_score)
- }
- }
- pprint(database_dict)
-
- return result_dict, database_dict
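-
- # Usage sketch (hypothetical data; true_values and y_train must contain both
- # classes and at least one True for the value_counts().loc[True] lookups):
- # explorer = DataExplorer()
- # result_dict, database_dict = explorer.calculate_machine_learning_scores(
- #     predictions=pd.Series([True, False, True, False]),
- #     true_values=pd.Series([True, False, False, False]),
- #     y_train=pd.Series([True, False, True, False]))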
- def get_total_prediction_results(self, in_data, limit: float = 0.5, station=None):
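- '''
- Aggregates the per-wheelset prediction lists from the 'process' column
- against the flattened 'final_state' label and returns summary statistics,
- either for a single station or for all stations (station=None).
- '''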
- data = deepcopy(in_data)
- data_flattener = FlattenData()
- data['ist_schrott'] = data_flattener.flatten(data['final_state'])
- predictions_lists = []
- stations_lists = []
- num_correct_predictions = []
- num_wrong_predictions = []
-
- for true_value, predictions in zip(data['ist_schrott'], data['process']):
- predictions_list, stations_list= self.filter_by_threshold_and_station(predictions, limit, station)
- predictions_lists.append(predictions_list)
- stations_lists.append(stations_list)
- num_true = sum(predictions_list)
- num_false = len(predictions_list) - sum(predictions_list)
- if true_value:
- num_correct_predictions.append(num_true)
- num_wrong_predictions.append(num_false)
- else:
- num_correct_predictions.append(num_false)
- num_wrong_predictions.append(num_true)
-
- # Fill DataFrame
- data['stations'] = stations_lists
- data['predictions'] = predictions_lists
- data['num_predictions'] = data['predictions'].str.len()
- data['num_correct_predictions'] = num_correct_predictions
- data['%_correct_predictions'] = round((data['num_correct_predictions'] / data['num_predictions'])*100, 1)
- data['num_wrong_predictions'] = num_wrong_predictions
- data['%_wrong_predictions'] = round((data['num_wrong_predictions'] / data['num_predictions'])*100, 1)
- return_dict = {}
- if station is None:
- station = 'All Stations'
- return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data) }
- return_dict[station+' per Wheelset'] = {'result': self.calculate_statistics_per_wheelset(data)}
- else:
- return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data) }
- data.drop(['final_state', 'process'], axis=1, inplace=True)
- return return_dict
- def filter_by_threshold_and_station(self, schrott_list: list, limit: float = 0.5, station=None):
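- '''
- Extracts boolean predictions (schrott probability > limit) and their
- station numbers from a list of process entries, optionally restricted to
- a single station.
- '''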
- prediction_list = []
- station_list = []
- for value in schrott_list:
- if value and 'schrott' in value:
- if station is None:
- station_list.append(value['stationsnummer'])
- prediction_list.append(value['schrott'] > limit)
- elif value['stationsnummer'] == station:
- station_list.append(value['stationsnummer'])
- prediction_list.append(value['schrott'] > limit)
-
- return (prediction_list, station_list)
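-
- # Example (hypothetical process entries):
- # self.filter_by_threshold_and_station([{'schrott': 0.7, 'stationsnummer': 130}, {'stationsnummer': 140}], limit=0.5)
- # -> ([True], [130]); the second entry has no 'schrott' score and is skipped.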
- def calculate_statistics_for_all_predictions(self, data: pd.DataFrame):
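- '''
- Builds a confusion matrix over all individual predictions (each process
- step counts separately) and derives accuracy and balanced accuracy.
- '''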
- correct_predictions = data['num_correct_predictions'].sum()
- wrong_predictions = data['num_wrong_predictions'].sum()
- true_positives = data[data['ist_schrott'] == True]['num_correct_predictions'].sum()
- false_negatives = data[data['ist_schrott'] == True]['num_wrong_predictions'].sum()
- true_negatives = data[data['ist_schrott'] == False]['num_correct_predictions'].sum()
- false_positives = data[data['ist_schrott'] == False]['num_wrong_predictions'].sum()
- confusion_matrix = [true_negatives, false_positives, false_negatives, true_positives]
- if (true_negatives + false_positives) != 0 and (false_negatives + true_positives) != 0:
- normalized_confusion_matrix = [round(true_negatives/(true_negatives + false_positives), 3),\
- round(false_positives/(true_negatives + false_positives), 3),\
- round(false_negatives/(false_negatives + true_positives), 3),\
- round(true_positives/(false_negatives + true_positives), 3)]
- else:
- # Fall back to zeros so the indexing below cannot fail when a class is missing.
- normalized_confusion_matrix = [0.0, 0.0, 0.0, 0.0]
- accuracy = round((data['num_correct_predictions'].sum()/data['num_predictions'].sum())*100, 1)
- balanced_accuracy = round(((normalized_confusion_matrix[0] + normalized_confusion_matrix[-1])/2)*100, 2)
-
- return_dict = {
- 'correct_predictions': str(correct_predictions),
- 'wrong_predictions': str(wrong_predictions),
- 'confusion_matrix': [
- ('True Negatives', int(confusion_matrix[0])),
- ('False Positives', int(confusion_matrix[1])),
- ('False Negatives', int(confusion_matrix[2])),
- ('True Positives', int(confusion_matrix[3]))
- ],
- 'normalized_confusion_matrix': [
- ('True Negatives', round(normalized_confusion_matrix[0]*100,2)),
- ('False Positives', round(normalized_confusion_matrix[1]*100,2)),
- ('False Negatives', round(normalized_confusion_matrix[2]*100,2)),
- ('True Positives', round(normalized_confusion_matrix[3]*100,2))
- ],
- 'accuracy': accuracy,
- 'balanced_accuracy': balanced_accuracy
- }
- return return_dict
- def calculate_statistics_per_wheelset(self, data: pd.DataFrame):
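- '''
- Builds a confusion matrix at wheelset level: a Schrott wheelset counts as
- a true positive if at least one of its predictions was correct, and a
- non-Schrott wheelset as a true negative only if none of its predictions
- were wrong.
- '''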
- num_wheelsets = len(data.index)
- schrott_data = data[data['ist_schrott'] == True]
- not_schrott_data = data[data['ist_schrott'] == False]
-
- wheelset_true_positives = len(schrott_data[schrott_data['num_correct_predictions']>0].index)
- wheelset_false_positives = len(not_schrott_data[not_schrott_data['num_wrong_predictions']>0].index)
- wheelset_true_negatives = len(not_schrott_data[not_schrott_data['num_wrong_predictions']==0].index)
- wheelset_false_negatives = len(schrott_data[schrott_data['num_correct_predictions']==0].index)
- wheelset_correct = wheelset_true_positives + wheelset_true_negatives
- wheelset_wrong = wheelset_false_positives + wheelset_false_negatives
- wheelset_confusion_matrix = [wheelset_true_negatives, wheelset_false_positives, wheelset_false_negatives, wheelset_true_positives]
- if (wheelset_true_negatives + wheelset_false_positives) != 0 and (wheelset_false_negatives + wheelset_true_positives) != 0:
- wheelset_normalized_confusion_matrix = [round(wheelset_true_negatives/(wheelset_true_negatives + wheelset_false_positives), 3),\
- round(wheelset_false_positives/(wheelset_true_negatives + wheelset_false_positives), 3),\
- round(wheelset_false_negatives/(wheelset_false_negatives + wheelset_true_positives), 3),\
- round(wheelset_true_positives/(wheelset_false_negatives + wheelset_true_positives), 3)]
- else:
- # Fall back to zeros so the indexing below cannot fail when a class is missing.
- wheelset_normalized_confusion_matrix = [0.0, 0.0, 0.0, 0.0]
- wheelset_accuracy = round((wheelset_correct/num_wheelsets)*100, 1)
- wheelset_balanced_accuracy = round(((wheelset_normalized_confusion_matrix[0] + wheelset_normalized_confusion_matrix[-1])/2)*100, 2)
-
- print('\n', '*-*'*20)
- print('Statistics per wheelset')
- print('Correct predictions:', wheelset_correct)
- print('Wrong predictions:', wheelset_wrong)
- print('Confusion Matrix:', [['True Negatives', str(wheelset_confusion_matrix[0])], ['False Positives', str(wheelset_confusion_matrix[1])],\
- ['False Negatives', str(wheelset_confusion_matrix[2])], ['True Positives', str(wheelset_confusion_matrix[3])]])
- print('Normalized Confusion Matrix:', [['True Negatives', str(wheelset_normalized_confusion_matrix[0])], ['False Positives', str(wheelset_normalized_confusion_matrix[1])],\
- ['False Negatives', str(wheelset_normalized_confusion_matrix[2])], ['True Positives', str(wheelset_normalized_confusion_matrix[3])]])
- print('Accuracy:', wheelset_accuracy, '%')
- print('Balanced Accuracy:', wheelset_balanced_accuracy, '%')
-
- return_dict = {
- 'correct_predictions': str(wheelset_correct),
- 'wrong_predictions': str(wheelset_wrong),
- 'confusion_matrix':[
- ('True Negatives', int(wheelset_confusion_matrix[0])),
- ('False Positives', int(wheelset_confusion_matrix[1])),
- ('False Negatives', int(wheelset_confusion_matrix[2])),
- ('True Positives', int(wheelset_confusion_matrix[3]))
- ],
- 'normalized_confusion_matrix': [
- ('True Negatives', round(wheelset_normalized_confusion_matrix[0]*100, 2)),
- ('False Positives', round(wheelset_normalized_confusion_matrix[1]*100, 2)),
- ('False Negatives', round(wheelset_normalized_confusion_matrix[2]*100, 2)),
- ('True Positives', round(wheelset_normalized_confusion_matrix[3]*100, 2))
- ],
- 'accuracy': wheelset_accuracy,
- 'balanced_accuracy': wheelset_balanced_accuracy
- }
- return return_dict
- # Cost for new wheelset - 2000€
- # Average cost for overhauling - ~3h = €250
- # True positive: 250
- # False positive: -2000
- # True negative: 0
- # False negative: -250
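- # Worked example with the default weights [tn, fp, fn, tp] = [0, -2000, 0, 250]
- # and a hypothetical confusion matrix tn=90, fp=5, fn=3, tp=2:
- # score = 90*0 + 5*(-2000) + 3*0 + 2*250 = -9500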
- def custom_cost_function(self, correct_values, predictions, weights_list: list = [0, -2000, 0, 250], normalized: bool = False):
- from sklearn.metrics import confusion_matrix
- tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
- tn_weight, fp_weight, fn_weight, tp_weight = weights_list
- # Best achievable score: every negative counted as tn, every positive as tp.
- max_score = (tn+fp)*tn_weight + (tp+fn)*tp_weight
- score = tn*tn_weight + fp*fp_weight + fn*fn_weight + tp*tp_weight
- if normalized:
- return score/max_score
- else:
- return score
- # old weights_list: list = [1.2, -500, -0.2, 2]
- def balanced_custom_cost_function(self, correct_values, predictions, weights_list: list = [0, -2000, 0, 250], normalized: bool = False):
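- '''
- Splits the custom cost into a negative-class part and a positive-class
- part and averages the two, optionally normalizing each part by its maximum
- achievable score so the majority class cannot dominate the result.
- '''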
- from sklearn.metrics import confusion_matrix
- tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
- tn_weight, fp_weight, fn_weight, tp_weight = weights_list
- max_score_negatives = (tn+fp)*tn_weight
- max_score_positives = (tp+fn)*tp_weight
- score_negatives = tn*tn_weight + fp*fp_weight
- score_positives = fn*fn_weight + tp*tp_weight
- if normalized:
- # Guard against division by zero; with the default tn_weight of 0 the negative maximum is always 0.
- if max_score_negatives != 0:
- normalized_negatives = score_negatives / max_score_negatives
- else:
- normalized_negatives = score_negatives / (max_score_negatives + 0.00001)
- if max_score_positives != 0:
- normalized_positives = score_positives / max_score_positives
- else:
- normalized_positives = score_positives / (max_score_positives + 0.00001)
- return (normalized_negatives + normalized_positives) / 2
- else:
- return (score_negatives + score_positives) / 2
- if __name__ == "__main__":
- from libraries.db_handlers.OebbMongodbHandler import OebbMongodbHandler
- mongodb_handler = OebbMongodbHandler()
- data_explorer = DataExplorer()
- from_date = str(datetime.datetime(2017, 1, 1))
- to_date = str(datetime.datetime(2018, 1, 1))
-
- find_query = {
- 'process.beginn_der_bearbeitung':{
- '$gt': from_date,
- '$lt': to_date
- },
- 'process.schrott':{
- '$exists': True
- }
- }
- data = mongodb_handler.query_data_and_generate_dataframe('process_instances', find_query=find_query, return_values={'radsatznummer': 1, 'process.schrott': 1, 'process.stationsnummer': 1, 'final_state.ist_schrott': 1, '_id': 0}, index='radsatznummer')
- stations = [421, 110, 130, 140, 680, 410, 510, 520, 320, 480, 490, 595, 535, None]
- result_dict = {}
- for station in stations:
- result_dict.update(data_explorer.get_total_prediction_results(data, 0.5, station))
-
- result_df = pd.DataFrame.from_dict(result_dict, orient='index')
-
- mongodb_handler.insert_data_into_collection(result_dict, 'prediction_scores')
- result_df.index.name = 'station'
- print(result_df)
- result_df.to_excel(os.path.join('.', 'documentation', 'prediction_scores.xlsx'))