#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import os
import time
from pprint import pprint
import pandas as pd
import numpy as np
import datetime
sys.path.append(os.getcwd())
from copy import deepcopy
from cdplib.log import Log
from cdplib.FlattenData import FlattenData
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
class DataExplorer:
    def __init__(self):
        self._log = Log("Data Explorer")
    def calculate_correlation(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False) -> pd.DataFrame:
        no_nan_data = data.fillna(0)
        correlations = []
        pearson_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict, method='pearson')
        self._log.info('Calculated Pearson correlation')
        if verbose:
            kendall_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict, method='kendall')
            correlations.append(kendall_correlation)
            self._log.info('Calculated Kendall correlation')
            spearman_correlation = self.calculate_big_matrix_correlation(data=data, column_name_to_predict=column_name_to_predict, method='spearman')
            correlations.append(spearman_correlation)
            self._log.info('Calculated Spearman correlation')
        cosine_similarity = self.calculate_cosine_similarity_for_dataframe_columns(no_nan_data, column_name_to_predict)
        correlations.append(cosine_similarity)
        self._log.info('Calculated cosine similarity')
        chi2_independence = self.calculate_chi2_independence_for_dataframe_columns(data, column_name_to_predict, verbose)
        correlations.append(chi2_independence)
        self._log.info('Calculated chi2 independence')
        fisher_exact_test = self.calculate_fisher_exact_for_dataframe_column(data, column_name_to_predict)
        correlations.append(fisher_exact_test)
        self._log.info('Calculated Fisher Exact Test')
        gert_correlation = self.calculate_feature_and_schrott_occurrences(data, column_name_to_predict, prediction_name, verbose)
        correlations.append(gert_correlation)
        self._log.info('Calculated Gert correlation')
        merged_data = pearson_correlation
        for correlation in correlations:
            if len(correlation.index) > 0:
                merged_data = merged_data.join(correlation, how='left')
        return merged_data
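    # Usage sketch (hypothetical toy data; 'ist_schrott' stands in for any
    # binary target column):
    #
    #   explorer = DataExplorer()
    #   toy = pd.DataFrame({'ist_schrott': [1, 0, 1, 0],
    #                       'feature_a':   [1, 1, 0, 0]})
    #   scores = explorer.calculate_correlation(toy, 'ist_schrott', 'Schrott')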
    def calculate_big_matrix_correlation(self, data: pd.DataFrame, column_name_to_predict: str, method: str = 'pearson') -> pd.DataFrame:
        num_columns = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']
        result_data = {}
        for column in data.columns:
            if data[column].dtype in num_columns:
                result_data[column] = data[column_name_to_predict].corr(data[column], method=method)
        result_df = pd.DataFrame.from_dict(result_data, orient='index')
        label_string = method + ' correlation'
        result_df.columns = [label_string]
        return result_df
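    # Correlating each column against the target one at a time (instead of a
    # full NxN DataFrame.corr()) keeps memory flat on wide matrices. A minimal
    # check, reusing the hypothetical toy frame from above:
    #
    #   corr_df = explorer.calculate_big_matrix_correlation(toy, 'ist_schrott')
    #   # one row per numeric column, a single 'pearson correlation' column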
    def calculate_feature_and_schrott_occurrences(self, data: pd.DataFrame, column_name_to_predict: str, prediction_name: str, verbose: bool = False) -> pd.DataFrame:
        result_dict = {}
        len_data = len(data.index)
        total_schrott = len(data[data[column_name_to_predict] == 1].index)
        counter = 0
        for column in data.columns:
            occurrences = 0
            schrott_occurrences = 0
            temp_data = data[data[column] == 1]
            if len(temp_data.index) > 0:
                occurrences = len(temp_data.index)
                schrott_occurrences = len(temp_data[temp_data[column_name_to_predict] == 1].index)
                schrott_wheelsets = list(temp_data[temp_data[column_name_to_predict] == 1].index)
                non_schrott = occurrences - schrott_occurrences
                non_occurrences = len_data - occurrences
                percentage_of_occurrences_schrott = round((schrott_occurrences/occurrences)*100, 2)
                percentage_of_occurrences_not_schrott = round((non_schrott/occurrences)*100, 2)
                if verbose:
                    result_dict[column] = {
                        'Occurs': occurrences,
                        '%_Occurs': occurrences/len_data,
                        prediction_name + '_Occurs': schrott_occurrences,
                        '%_' + prediction_name + '_Occurs': percentage_of_occurrences_schrott,
                        '%_Total_' + prediction_name: round((schrott_occurrences/total_schrott)*100, 2),
                        '!Occurs': non_occurrences,
                        '%_!Occurs': round((non_occurrences/len_data)*100, 2),
                        'Occurs_!' + prediction_name: non_schrott,
                        '%_Occurs_!' + prediction_name: percentage_of_occurrences_not_schrott,
                        '%_Total_!' + prediction_name: round((non_schrott/len_data)*100, 2),
                        'Wheelsets_!' + prediction_name: [wheelset for wheelset in list(temp_data.index) if wheelset not in schrott_wheelsets],
                        'Wheelsets_' + prediction_name: schrott_wheelsets
                    }
                else:
                    result_dict[column] = {
                        'Occurs': occurrences,
                        '%_Occurs': occurrences/len_data,
                        prediction_name + '_Occurs': schrott_occurrences,
                        '%_' + prediction_name + '_Occurs': percentage_of_occurrences_schrott,
                        '%_Total_' + prediction_name: round((schrott_occurrences/total_schrott)*100, 2),
                    }
            counter += 1
            if counter % 100 == 0:
                print('Calculated {} / {}'.format(counter, len(data.columns)))
        return pd.DataFrame.from_dict(result_dict, orient='index')
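    # The method above tallies, for each binary feature column, how often the
    # feature fires and how often it co-occurs with the scrap label. A rough
    # usage sketch (column names are hypothetical):
    #
    #   occ = explorer.calculate_feature_and_schrott_occurrences(
    #       toy, 'ist_schrott', 'Schrott')
    #   # occ['%_Schrott_Occurs'] is the share of a feature's rows that are scrap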
    def calculate_fisher_exact_for_dataframe_column(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame:
        from scipy.stats import fisher_exact
        result_dict = {}
        predict_count = data[column_name_to_predict]
        for column in data.columns:
            fisher_data = pd.crosstab(predict_count, data[column])
            # fisher_exact() requires a 2x2 contingency table
            if fisher_data.shape == (2, 2):
                oddsratio, pvalue = fisher_exact(fisher_data)
                result_dict[column] = {
                    'Fisher_Exact_pvalue': pvalue,
                    'Fisher_Exact_oddsratio': oddsratio
                }
            else:
                result_dict[column] = {
                    'Fisher_Exact_pvalue': None,
                    'Fisher_Exact_oddsratio': None
                }
        return pd.DataFrame.from_dict(result_dict, orient='index')
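    # Standalone sanity check for the Fisher test wiring (scipy only; the
    # 2x2 table below is made up):
    #
    #   from scipy.stats import fisher_exact
    #   oddsratio, pvalue = fisher_exact([[8, 2], [1, 5]])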
    def calculate_cosine_similarity_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str) -> pd.DataFrame:
        from sklearn.metrics.pairwise import cosine_similarity
        # Transpose so that each column becomes a row vector, then keep only
        # the similarities against the target column.
        cosine_corr = cosine_similarity(data.transpose())
        cosine_df = pd.DataFrame(cosine_corr, columns=data.columns, index=data.columns)
        cosine_df = pd.DataFrame(cosine_df[column_name_to_predict])
        cosine_df.columns = ['Cosine Similarity']
        return cosine_df
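    # Equivalent pairwise computation for two single columns, for reference
    # (numpy only; the column names are hypothetical):
    #
    #   a, b = toy['feature_a'].values, toy['ist_schrott'].values
    #   cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))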
    def calculate_chi2_independence_for_dataframe_columns(self, data: pd.DataFrame, column_name_to_predict: str, verbose: bool = False) -> pd.DataFrame:
        from scipy.stats import chi2_contingency
        result_dict = {}
        Y = data[column_name_to_predict]
        for column in data.columns:
            X = data[column]
            observed = pd.crosstab(Y, X)
            chi2 = None
            p = None
            dof = None
            expected = None
            # chi2_contingency() is only run on 2x2 contingency tables here
            if observed.size == 4:
                chi2, p, dof, expected = chi2_contingency(observed.values)
                if dof > 1:
                    self._log.warning('Calculation contained DOF > 1, this is the data:\n {}'.format(X))
                if verbose:
                    result_dict[column] = {
                        'Chi2': chi2,
                        'Chi2, p': p,
                        'Chi2, dof': dof,
                        'Chi2, expected': expected
                    }
                else:
                    result_dict[column] = {
                        'Chi2, p': p,
                    }
            else:
                if verbose:
                    result_dict[column] = {
                        'Chi2': None,
                        'Chi2, p': None,
                        'Chi2, dof': None,
                        'Chi2, expected': None
                    }
                else:
                    result_dict[column] = {
                        'Chi2, p': None,
                    }
        return pd.DataFrame.from_dict(result_dict, orient='index')
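    # Standalone chi2 sanity check on a made-up 2x2 table (scipy only):
    #
    #   from scipy.stats import chi2_contingency
    #   chi2, p, dof, expected = chi2_contingency([[10, 20], [30, 40]])
    #   # dof == 1 for a 2x2 table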
    def calculate_machine_learning_scores(self, predictions, true_values, y_train, to_date: str = None):
        '''
        Calculates several different measures on the predictions compared to
        the true values and returns two dicts: one to be saved to Excel and
        one to be submitted to MongoDB.
        :param predictions: The machine learning predictions
        :param true_values: The correct predictions for the dataset
        :param y_train: The correct predictions for the training data
        :param to_date: Timestamp for the result; defaults to the current UTC time
        '''
        from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, balanced_accuracy_score
        if to_date is None:
            to_date = str(datetime.datetime.utcnow())
        # Local names are kept distinct from the imported sklearn functions
        # so that the functions are not shadowed.
        conf_matrix = confusion_matrix(true_values, predictions)
        normalized_confusion_matrix = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
        class_report = classification_report(true_values, predictions)
        accuracy = round(accuracy_score(true_values, predictions)*100, 2)
        roc_auc = round(roc_auc_score(true_values, predictions)*100, 2)
        balanced_custom_cost_score = self.balanced_custom_cost_function(true_values, predictions)
        custom_cost_score = self.custom_cost_function(true_values, predictions)
        balanced_accuracy = round(balanced_accuracy_score(true_values, predictions)*100, 2)
        print("Timestamp\n", to_date)
        print('\n')
        print('True positive rate = {}'.format(conf_matrix[1][1]/(conf_matrix[1][1]+conf_matrix[1][0])))
        print('False positive rate = {}'.format(conf_matrix[0][1]/(conf_matrix[0][1]+conf_matrix[0][0])))
        print('\n')
        print('Accuracy:\n', accuracy)
        print('\n')
        print('Balanced Accuracy:\n', balanced_accuracy)
        print('\n')
        print('ROC AUC Score:\n', roc_auc)
        print('\n')
        print('Balanced Custom Cost Function:\n', balanced_custom_cost_score)
        print('\n')
        print('Custom Cost Function:\n', custom_cost_score)
        print('\n')
        print('Confusion Matrix:\n', conf_matrix)
        print('\n')
        print('Normalized Confusion Matrix:\n', normalized_confusion_matrix)
        print('\n')
        print('Classification Report:\n', class_report)
        result_dict = {
            'Size of training set:': len(y_train),
            'Number of Schrott in training set': y_train.value_counts().loc[True],
            'Size of prediction set:': len(true_values),
            'Number of Schrott in prediction set': true_values.value_counts().loc[True],
            'Confusion Matrix': conf_matrix,
            'Normalized Confusion Matrix': normalized_confusion_matrix,
            'Accuracy': accuracy,
            'Balanced Accuracy': balanced_accuracy,
            'ROC AUC Score:': roc_auc,
            'Balanced Custom Cost Score:': balanced_custom_cost_score,
            'Custom Cost Score:': custom_cost_score
        }
        flat_conf_matrix = conf_matrix.ravel()
        database_dict = {
            "timestamp": to_date,
            "result": {
                "training_set": {
                    "amount": int(len(y_train)),
                    "schrott": int(y_train.value_counts().loc[True])
                },
                "test_set": {
                    "amount": int(len(true_values)),
                    "schrott": int(true_values.value_counts().loc[True])
                },
                "confusion_matrix": [
                    ("True Negatives", int(flat_conf_matrix[0])),
                    ("False Positives", int(flat_conf_matrix[1])),
                    ("False Negatives", int(flat_conf_matrix[2])),
                    ("True Positives", int(flat_conf_matrix[3]))
                ],
                "normalized_confusion_matrix": [
                    ("True Negatives", round(normalized_confusion_matrix[0][0]*100, 2)),
                    ("False Positives", round(normalized_confusion_matrix[0][1]*100, 2)),
                    ("False Negatives", round(normalized_confusion_matrix[1][0]*100, 2)),
                    ("True Positives", round(normalized_confusion_matrix[1][1]*100, 2))
                ],
                "accuracy": accuracy,
                "balanced_accuracy": balanced_accuracy,
                "ROC_AUC_score": roc_auc,
                'balanced_custom_cost_score': int(balanced_custom_cost_score),
                'custom_cost_score': int(custom_cost_score)
            }
        }
        pprint(database_dict)
        return result_dict, database_dict
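    # sklearn's confusion_matrix layout, for reference when reading the
    # ravel()ed values above:
    #
    #   [[TN, FP],
    #    [FN, TP]]
    #
    # so conf_matrix.ravel() yields (tn, fp, fn, tp).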
    def get_total_prediction_results(self, in_data, limit: float = 0.5, station=None):
        data = deepcopy(in_data)
        data_flattener = FlattenData()
        data['ist_schrott'] = data_flattener.flatten(data['final_state'])
        predictions_lists = []
        stations_lists = []
        num_correct_predictions = []
        num_wrong_predictions = []
        for true_value, predictions in zip(data['ist_schrott'], data['process']):
            predictions_list, stations_list = self.filter_by_threshold_and_station(predictions, limit, station)
            predictions_lists.append(predictions_list)
            stations_lists.append(stations_list)
            num_true = sum(predictions_list)
            num_false = len(predictions_list) - sum(predictions_list)
            if true_value:
                num_correct_predictions.append(num_true)
                num_wrong_predictions.append(num_false)
            else:
                num_correct_predictions.append(num_false)
                num_wrong_predictions.append(num_true)
        # Fill DataFrame
        data['stations'] = stations_lists
        data['predictions'] = predictions_lists
        data['num_predictions'] = data['predictions'].str.len()
        data['num_correct_predictions'] = num_correct_predictions
        data['%_correct_predictions'] = round(data['num_correct_predictions'] / data['num_predictions'], 3)*100
        data['num_wrong_predictions'] = num_wrong_predictions
        data['%_wrong_predictions'] = round(data['num_wrong_predictions'] / data['num_predictions'], 3)*100
        return_dict = {}
        if station is None:
            station = 'All Stations'
            return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data)}
            return_dict[station + ' per Wheelset'] = {'result': self.calculate_statistics_per_wheelset(data)}
        else:
            return_dict[str(station)] = {'result': self.calculate_statistics_for_all_predictions(data)}
        data.drop(['final_state', 'process'], axis=1, inplace=True)
        return return_dict
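    # Example call (station numbers are project-specific; None aggregates
    # over all stations and adds a per-wheelset breakdown):
    #
    #   results = explorer.get_total_prediction_results(data, limit=0.5, station=None)
    #   # -> {'All Stations': {'result': {...}},
    #   #     'All Stations per Wheelset': {'result': {...}}}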
    def filter_by_threshold_and_station(self, schrott_list: list, limit: float = 0.5, station=None):
        prediction_list = []
        station_list = []
        for value in schrott_list:
            if value and 'schrott' in value.keys():
                if station is None:
                    station_list.append(value['stationsnummer'])
                    prediction_list.append(value['schrott'] > limit)
                elif value['stationsnummer'] == station:
                    station_list.append(value['stationsnummer'])
                    prediction_list.append(value['schrott'] > limit)
        return (prediction_list, station_list)
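    # A minimal check of the filter (the entry shape mirrors the 'process'
    # documents queried in __main__; the values here are invented):
    #
    #   entries = [{'stationsnummer': 130, 'schrott': 0.7},
    #              {'stationsnummer': 140, 'schrott': 0.2}]
    #   preds, stations = explorer.filter_by_threshold_and_station(entries, 0.5, None)
    #   # preds == [True, False], stations == [130, 140]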
    def calculate_statistics_for_all_predictions(self, data: pd.DataFrame):
        correct_predictions = data['num_correct_predictions'].sum()
        wrong_predictions = data['num_wrong_predictions'].sum()
        true_positives = data[data['ist_schrott'] == True]['num_correct_predictions'].sum()
        false_negatives = data[data['ist_schrott'] == True]['num_wrong_predictions'].sum()
        true_negatives = data[data['ist_schrott'] == False]['num_correct_predictions'].sum()
        false_positives = data[data['ist_schrott'] == False]['num_wrong_predictions'].sum()
        confusion_matrix = [true_negatives, false_positives, false_negatives, true_positives]
        if (true_negatives + false_positives) != 0 and (false_negatives + true_positives) != 0:
            normalized_confusion_matrix = [round(true_negatives/(true_negatives + false_positives), 3),
                                           round(false_positives/(true_negatives + false_positives), 3),
                                           round(false_negatives/(false_negatives + true_positives), 3),
                                           round(true_positives/(false_negatives + true_positives), 3)]
        else:
            # Fall back to an all-zero matrix so the indexing below cannot fail
            normalized_confusion_matrix = [0.0, 0.0, 0.0, 0.0]
        accuracy = round(data['num_correct_predictions'].sum()/(data['num_predictions'].sum()), 3)*100
        balanced_accuracy = round((normalized_confusion_matrix[0] + normalized_confusion_matrix[-1])/2, 2)*100
        return_dict = {
            'correct_predictions': str(correct_predictions),
            'wrong_predictions': str(wrong_predictions),
            'confusion_matrix': [
                ('True Negatives', int(confusion_matrix[0])),
                ('False Positives', int(confusion_matrix[1])),
                ('False Negatives', int(confusion_matrix[2])),
                ('True Positives', int(confusion_matrix[3]))
            ],
            'normalized_confusion_matrix': [
                ('True Negatives', round(normalized_confusion_matrix[0]*100, 2)),
                ('False Positives', round(normalized_confusion_matrix[1]*100, 2)),
                ('False Negatives', round(normalized_confusion_matrix[2]*100, 2)),
                ('True Positives', round(normalized_confusion_matrix[3]*100, 2))
            ],
            'accuracy': accuracy,
            'balanced_accuracy': balanced_accuracy
        }
        return return_dict
    def calculate_statistics_per_wheelset(self, data: pd.DataFrame):
        num_wheelsets = len(data.index)
        schrott_data = data[data['ist_schrott'] == True]
        not_schrott_data = data[data['ist_schrott'] == False]
        # A wheelset counts as a positive prediction if at least one of its
        # individual predictions crossed the threshold.
        wheelset_true_positives = len(schrott_data[schrott_data['num_correct_predictions'] > 0].index)
        wheelset_false_positives = len(not_schrott_data[not_schrott_data['num_wrong_predictions'] > 0].index)
        wheelset_true_negatives = len(not_schrott_data[not_schrott_data['num_wrong_predictions'] == 0].index)
        wheelset_false_negatives = len(schrott_data[schrott_data['num_correct_predictions'] == 0].index)
        wheelset_correct = wheelset_true_positives + wheelset_true_negatives
        wheelset_wrong = wheelset_false_positives + wheelset_false_negatives
        wheelset_confusion_matrix = [wheelset_true_negatives, wheelset_false_positives, wheelset_false_negatives, wheelset_true_positives]
        if (wheelset_true_negatives + wheelset_false_positives) != 0 and (wheelset_false_negatives + wheelset_true_positives) != 0:
            wheelset_normalized_confusion_matrix = [round(wheelset_true_negatives/(wheelset_true_negatives + wheelset_false_positives), 3),
                                                    round(wheelset_false_positives/(wheelset_true_negatives + wheelset_false_positives), 3),
                                                    round(wheelset_false_negatives/(wheelset_false_negatives + wheelset_true_positives), 3),
                                                    round(wheelset_true_positives/(wheelset_false_negatives + wheelset_true_positives), 3)]
        else:
            # Fall back to an all-zero matrix so the indexing below cannot fail
            wheelset_normalized_confusion_matrix = [0.0, 0.0, 0.0, 0.0]
        wheelset_accuracy = round(wheelset_correct/num_wheelsets, 3)*100
        wheelset_balanced_accuracy = round((wheelset_normalized_confusion_matrix[0] + wheelset_normalized_confusion_matrix[-1])/2, 2)*100
        print('\n', '*-*'*20)
        print('Statistics per wheelset')
        print('Correct predictions:', wheelset_correct)
        print('Wrong predictions:', wheelset_wrong)
        print('Confusion Matrix:', [['True Negatives', str(wheelset_confusion_matrix[0])], ['False Positives', str(wheelset_confusion_matrix[1])],
                                    ['False Negatives', str(wheelset_confusion_matrix[2])], ['True Positives', str(wheelset_confusion_matrix[3])]])
        print('Normalized Confusion Matrix:', [['True Negatives', str(wheelset_normalized_confusion_matrix[0])], ['False Positives', str(wheelset_normalized_confusion_matrix[1])],
                                               ['False Negatives', str(wheelset_normalized_confusion_matrix[2])], ['True Positives', str(wheelset_normalized_confusion_matrix[3])]])
        print('Accuracy:', wheelset_accuracy, '%')
        print('Balanced Accuracy:', wheelset_balanced_accuracy, '%')
        return_dict = {
            'correct_predictions': str(wheelset_correct),
            'wrong_predictions': str(wheelset_wrong),
            'confusion_matrix': [
                ('True Negatives', int(wheelset_confusion_matrix[0])),
                ('False Positives', int(wheelset_confusion_matrix[1])),
                ('False Negatives', int(wheelset_confusion_matrix[2])),
                ('True Positives', int(wheelset_confusion_matrix[3]))
            ],
            'normalized_confusion_matrix': [
                ('True Negatives', round(wheelset_normalized_confusion_matrix[0]*100, 2)),
                ('False Positives', round(wheelset_normalized_confusion_matrix[1]*100, 2)),
                ('False Negatives', round(wheelset_normalized_confusion_matrix[2]*100, 2)),
                ('True Positives', round(wheelset_normalized_confusion_matrix[3]*100, 2))
            ],
            'accuracy': wheelset_accuracy,
            'balanced_accuracy': wheelset_balanced_accuracy
        }
        return return_dict
    # Cost for a new wheelset: ~2000€
    # Average cost for overhauling: ~3h = 250€
    # True positive: 250
    # False positive: -2000
    # True negative: 0
    # False negative: -250
    # NOTE: the default weights below use 0 (not -250) for false negatives.
    def custom_cost_function(self, correct_values, predictions, weights_list: list = [0, -2000, 0, 250], normalized: bool = False):
        from sklearn.metrics import confusion_matrix
        tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
        tn_weight, fp_weight, fn_weight, tp_weight = weights_list
        # Best possible score: every negative predicted negative, every
        # positive predicted positive.
        max_score = (tn+fp)*tn_weight + (tp+fn)*tp_weight
        score = tn*tn_weight + fp*fp_weight + fn*fn_weight + tp*tp_weight
        if normalized:
            return score/max_score
        else:
            return score
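    # Worked example with the default weights (tn=0, fp=-2000, fn=0, tp=250)
    # and an invented confusion matrix tn=90, fp=5, fn=3, tp=2:
    #
    #   score     = 90*0 + 5*(-2000) + 3*0 + 2*250 = -9500
    #   max_score = (90+5)*0 + (2+3)*250           =  1250
    #   normalized score = -9500/1250 = -7.6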
    # old weights_list: list = [1.2, -500, -0.2, 2]
    def balanced_custom_cost_function(self, correct_values, predictions, weights_list: list = [0, -2000, 0, 250], normalized: bool = False):
        from sklearn.metrics import confusion_matrix
        tn, fp, fn, tp = confusion_matrix(correct_values, predictions).ravel()
        tn_weight, fp_weight, fn_weight, tp_weight = weights_list
        # Score the negative and the positive class separately, then average,
        # so that class imbalance does not dominate the result.
        max_score_negatives = (tn+fp)*tn_weight
        max_score_positives = (tp+fn)*tp_weight
        score_negatives = tn*tn_weight + fp*fp_weight
        score_positives = fn*fn_weight + tp*tp_weight
        if normalized:
            # A small epsilon guards against division by zero; note that with
            # the default tn_weight of 0, max_score_negatives is always 0.
            if max_score_negatives != 0:
                normalized_negatives = score_negatives / max_score_negatives
            else:
                normalized_negatives = score_negatives / (max_score_negatives + 0.00001)
            if max_score_positives != 0:
                normalized_positives = score_positives / max_score_positives
            else:
                normalized_positives = score_positives / (max_score_positives + 0.00001)
            return (normalized_negatives + normalized_positives) / 2
        else:
            return (score_negatives + score_positives) / 2
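    # Both cost functions plug into calculate_machine_learning_scores() above;
    # they can also be called directly (labels and predictions invented):
    #
    #   y_true = [True, False, True, False]
    #   y_pred = [True, True, False, False]
    #   raw = DataExplorer().balanced_custom_cost_function(y_true, y_pred)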
if __name__ == "__main__":
    from libraries.db_handlers.OebbMongodbHandler import OebbMongodbHandler
    mongodb_handler = OebbMongodbHandler()
    data_explorer = DataExplorer()
    from_date = str(datetime.datetime(2017, 1, 1))
    to_date = str(datetime.datetime(2018, 1, 1))
    find_query = {
        'process.beginn_der_bearbeitung': {
            '$gt': from_date,
            '$lt': to_date
        },
        'process.schrott': {
            '$exists': True
        }
    }
    data = mongodb_handler.query_data_and_generate_dataframe(
        'process_instances',
        find_query=find_query,
        return_values={'radsatznummer': 1, 'process.schrott': 1, 'process.stationsnummer': 1, 'final_state.ist_schrott': 1, '_id': 0},
        index='radsatznummer')
    stations = [421, 110, 130, 140, 680, 410, 510, 520, 320, 480, 490, 595, 535, None]
    result_dict = {}
    for station in stations:
        result_dict.update(data_explorer.get_total_prediction_results(data, 0.5, station))
    result_df = pd.DataFrame.from_dict(result_dict, orient='index')
    mongodb_handler.insert_data_into_collection(result_dict, 'prediction_scores')
    result_df.index.name = 'station'
    print(result_df)
    result_df.to_excel(os.path.join('.', 'documentation', 'prediction_scores.xlsx'))