algoritm_knn.py

import subprocess
import PyPDF2
import numpy as np
#import csv
import order_bounding_boxes_in_each_block
import read_from_clustered_merged
import get_distances
import clustering_precomputed_dbscan_noParallels as dbscan

# read the project path from the config file; note that config_path keeps a
# trailing newline if config.txt ends with one (strip() it in that case)
with open('/home/bscheibel/PycharmProjects/clustering/config.txt', 'r') as myfile:
    config_path = myfile.read()
print("Path: ", config_path)
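
# This script clusters the text blocks of a technical drawing PDF with DBSCAN.
# eps is taken from the k-nearest-neighbour distances between blocks and is
# increased stepwise until the internal quality metrics stop improving.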

def calculate_inner_distance(result):
    # computes block size statistics; the values are currently unused (no return)
    min_size_x, max_size_x, min_size_y, max_size_y, diagonal = get_distances.size_blocks(result)
    #print("inner distance: ", diagonal)

def find_nearest_above(my_array, target):
    diff = my_array - target
    # mask the negative and zero differences, since we are looking for
    # values strictly above the target
    mask = np.ma.less_equal(diff, 0)
    if np.all(mask):
        return None  # target is greater than or equal to every value
    masked_diff = np.ma.masked_array(diff, mask)
    return masked_diff.argmin()
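
# Illustrative example (not part of the original code): with
# my_array = np.array([0.5, 1.2, 3.0]) and target = 1.2, the differences are
# [-0.7, 0.0, 1.8]; the first two get masked, so find_nearest_above returns
# index 2, the position of the smallest value strictly above the target.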

def avg_words_block_clustered(words, cluster):
    blocks = cluster
    avg_words = words / blocks
    return avg_words
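
# For example, avg_words_block_clustered(120, 30) == 4.0: 120 words spread over
# 30 clusters average out to 4 words per block.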

def convert_pdf_img(filename):
    # render the PDF to <config_path>/temporary/out.jpg via poppler's pdftoppm
    subprocess.call(['pdftoppm', '-jpeg', '-singlefile',
                     filename, config_path + '/temporary/out'])

def read_pdf(filename):
    # note: this uses the legacy PyPDF2 (< 3.0) API
    pdf = PyPDF2.PdfFileReader(filename, strict=False)
    p = pdf.getPage(0)
    w = p.mediaBox.getWidth()
    h = p.mediaBox.getHeight()
    orientation = p.get('/Rotate')
    return w, h, orientation
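
# A minimal sketch of the same read with the current pypdf (>= 3.0) API, in
# case PyPDF2 is unavailable (assumption: the pypdf package is installed):
#   from pypdf import PdfReader
#   page = PdfReader(filename).pages[0]
#   w, h = page.mediabox.width, page.mediabox.height
#   orientation = page.get('/Rotate')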

def read_webpage(filename):
    return "test"  # placeholder, not implemented yet

def get_min_nn(result, path):
    # sorted list of the unique k-nearest-neighbour distances between blocks,
    # used as the candidate eps values for DBSCAN
    dm = get_distances.distance_btw_blocks(result, path)
    knn = get_distances.distance_knn(dm)
    knn = sorted(set(knn))
    return knn
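
# Illustrative example (hypothetical distances): if distance_knn returns
# [3.2, 3.2, 5.1, 8.0], get_min_nn yields [3.2, 5.1, 8.0]; main() starts
# clustering with eps = 3.2 and later steps up through 5.1 and 8.0.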

def show_boxes(filepath, clean_arrays, eps):
    img_path = config_path + '/temporary/out.jpg'
    w, h, orientation = read_pdf(filepath)
    convert_pdf_img(filepath)
    filename = filepath.split("/")[-1]
    filename = filename.split(".pdf")[0]
    read_from_clustered_merged.highlight_image(clean_arrays, img_path, w, h, orientation, eps, filename)
    return filename

def main(uuid=123, filepath=config_path + "/" + "drawings/Stahl_Adapterplatte.PDF"):
    path = config_path
    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    # get coordinates+text out of the html file into an array of arrays
    result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
    print("number_blocks:", number_blocks)
    print("number_words:", number_words)
    avg_words_block = number_words / number_blocks
    print("avg words/blocks", avg_words_block)
    result_df = get_distances.get_average_xy(result, path)
    # get min_nn
    knn = get_min_nn(result_df, path)
    #print("knn: ", knn)
    eps = min(knn)
    print("min_knn: ", eps)
    # try one clustering iteration with eps = smallest knn value
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    # res = res.drop(res.columns[[0, 1]], axis=1).to_csv("test.csv", header=False)
    # #res = res.reset_index().tolist()
    # with open(path+"/test.csv") as csvfile:
    #     readCSV = csv.reader(csvfile, delimiter=';')
    #     res = list(readCSV)
    # print(res)
    #get_distances.get_average_xy(res, path)
    # (repeat of the call above; redundant while the block in between stays commented out)
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    # read default values
    clean_arrays = read_from_clustered_merged.read_default(path + "/temporary/list_to_csv_with_corner_points.csv")
    show_boxes(filepath, clean_arrays, "default")
    #print(number_clusters)
    clean_arrays = read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    # show results
    show_boxes(filepath, clean_arrays, eps)
    # look at the stop criterion
    avg_words_block = avg_words_block_clustered(number_words, number_clusters)
    print("a/w first clustering eps=1: ", avg_words_block)
    # cluster as long as the stop criterion is not met
    print("cluster, eps: ", eps)
    chs = chs_old
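    # The loop below sweeps eps upwards through the candidate knn distances and
    # re-clusters at each step. It stops once one of the internal quality
    # metrics (Davies-Bouldin, Calinski-Harabasz, silhouette) stops improving
    # while the average words-per-block already exceeds the first clustering's.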
    while True:  # runs until one of the stop criteria below breaks out
        print("cluster, eps: ", eps)
        dbs_old = dbs
        chs_old = chs
        silhoutte_old = silhoutte
        res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)
        # the stop criterion still has to be established (silhouette,
        # Davies-Bouldin, CH?), or a combination of the three
        avg_words_block_new = avg_words_block_clustered(number_words, number_clusters)
        print("avg_words_blocks:", avg_words_block_new)
        #stop_criterion = avg_words_block_new-avg_words_block
        read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
        # show results
        #show_boxes(filepath, clean_arrays, eps)
        print(dbs <= dbs_old)
        print(chs >= chs_old)
        print(silhoutte >= silhoutte_old)
        old_eps = eps
        # block to see which condition applies first, mostly silhouette and dbs
        if not dbs <= dbs_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached dbs")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold dbs")
        if not chs >= chs_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached chs")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold chs")
        if not silhoutte >= silhoutte_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached silhouette")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold silhouette")
        # and/or: "or" does not cluster too much, but can also cluster too little
        if (not dbs <= dbs_old or not chs >= chs_old or not silhoutte >= silhoutte_old) and avg_words_block_new > avg_words_block:
            print("stopping threshold reached")
            break
        try:
            # advance eps to the next larger knn distance; knn is a plain list,
            # so convert it for the numpy arithmetic in find_nearest_above
            idx = find_nearest_above(np.array(knn), eps)
            eps = knn[idx]
        except (TypeError, IndexError):  # idx is None once the largest value is reached
            print("highest nn value reached")
            break
    # re-run the clustering with the last eps value and show the final results
    res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, old_eps, path)
    print("Last EPS: ", old_eps)
    print("Last W/B: ", avg_words_block)
    clean_arrays = read_from_clustered_merged.read_default(path + "/temporary/list_to_csv_with_corner_points.csv")
    show_boxes(filepath, clean_arrays, "default")
    clean_arrays = read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    # show results
    show_boxes(filepath, clean_arrays, eps)

if __name__ == "__main__":
    main()
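
# To run the whole pipeline on the default drawing (assuming config.txt points
# at the project root and poppler's pdftoppm is on the PATH):
#   python algoritm_knn.py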