get_distances.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. from math import sqrt
  2. import numpy as np
  3. import pandas
  4. import csv
  5. #import math
  6. from clustering_precomputed_dbscan_noParallels import intersects
  7. from scipy import stats
  8. def get_average_xy(list_input, path):
  9. csv_name = path+"/temporary/list_to_csv_with_corner_points_distances.csv"
  10. resultFile = open(csv_name, 'w')
  11. wr = csv.writer(resultFile, delimiter=";")
  12. wr.writerow(["element","point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])
  13. result_df = pandas.DataFrame(columns=["point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])
  14. #result_df = pandas.DataFrame(columns=["xmin","ymin","xmax","ymax"])
  15. for element in list_input:
  16. xavg_elem = 0
  17. yavg_elem = 0
  18. ymin = 100000000
  19. ymax = 0
  20. xmin = 100000000
  21. xmax = 0
  22. newList = []
  23. check = False
  24. if len(element) == 5 and not isinstance(element[0], list):
  25. newList.append(element)
  26. element = newList
  27. for blub in element: #get the smallest and largest x and y value for whole block
  28. if isinstance(blub[0],list) and len(blub[0])==5:
  29. blub = blub [0]
  30. if float(blub[1]) < ymin:
  31. ymin = float(blub[1])
  32. #print("y_min:",y_min)
  33. if float(blub[0]) < xmin:
  34. xmin = float(blub[0])
  35. if float(blub[3]) > ymax:
  36. ymax = float(blub[3])
  37. if float(blub[2]) > xmax:
  38. xmax = float(blub[2])
  39. point_xmi_ymi = [xmin,ymin]
  40. point_xma_ymi = [xmax,ymin]
  41. point_xmi_yma = [xmin,ymax]
  42. point_xma_yma = [xmax,ymax]
  43. wr.writerow([element, point_xmi_ymi, point_xma_ymi, point_xmi_yma, point_xma_yma])
  44. result_df.loc[len(result_df)] = [ point_xmi_ymi, point_xma_ymi, point_xmi_yma, point_xma_yma]
  45. resultFile.close()
  46. #result_df.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
  47. return result_df
  48. def dist(rectangle1, rectangle2):
  49. #get minimal distance between two rectangles
  50. distance = 100000000
  51. #print(rectangle2,rectangle1)
  52. for point1 in rectangle1[:4]:
  53. #print(point1)
  54. point1 = eval(point1) #necessary to convert [] to real tuple
  55. for point2 in rectangle2[:4]:
  56. #print(point2)
  57. point2 = eval(point2)
  58. dist = sqrt(((float(point2[0]) - float(point1[0])))**2 + ((float(point2[1]) - float(point1[1])))**2)
  59. if dist < distance:
  60. distance = dist
  61. if intersects(rectangle1, rectangle2):
  62. distance = 0
  63. return distance
  64. def size_blocks(list_input): #x, y distance of blocks (not regarding words)
  65. #print(list_input)
  66. min_size_x = 1000000000
  67. max_size_x = 0
  68. min_size_y = 1000000000
  69. max_size_y = 0
  70. x_size = []
  71. y_size = []
  72. diagonal = []
  73. for element in list_input:
  74. newList = []
  75. ymin = 1000000000
  76. ymax = 0
  77. xmin = 1000000000
  78. xmax = 0
  79. if len(element) == 5 and not isinstance(element[0], list):
  80. newList.append(element)
  81. element = newList
  82. for blub in element: # get the smallest and largest x and y value for whole block, block sizes
  83. print(blub)
  84. if isinstance(blub[0], list) and len(blub[0]) == 5:
  85. blub = blub[0]
  86. if float(blub[1]) < ymin:
  87. ymin = float(blub[1])
  88. # print("y_min:",y_min)
  89. if float(blub[0]) < xmin:
  90. xmin = float(blub[0])
  91. if float(blub[3]) > ymax:
  92. ymax = float(blub[3])
  93. if float(blub[2]) > xmax:
  94. xmax = float(blub[2])
  95. distance_x = xmax-xmin
  96. distance_y = ymax-ymin
  97. diagonal_ = math.sqrt(distance_x ** 2 + distance_y ** 2) # satz der pythogoras
  98. diagonal.append(diagonal_)
  99. x_size.append(distance_x)
  100. #print(distance_x, blub[4])
  101. #print(distance_y, blub[4])
  102. y_size.append(distance_y)
  103. if distance_x < min_size_x:
  104. min_size_x = distance_x
  105. if distance_x > max_size_x:
  106. max_size_x = distance_x
  107. if distance_y < min_size_y:
  108. min_size_y = distance_y
  109. if distance_y > max_size_y:
  110. max_size_y = distance_y
  111. x_size = np.array(x_size)
  112. x_size = x_size.round(decimals=0)
  113. #print(x_size)
  114. median_size_x = np.median(x_size)
  115. modus_size_x = stats.mode(x_size)
  116. y_size = np.array(y_size)
  117. y_size = y_size.round(decimals=0)
  118. #print(y_size)
  119. median_size_y = np.median(y_size)
  120. modus_size_y = stats.mode(y_size)
  121. #print(min_size_x,max_size_x, min_size_y, max_size_y)
  122. print("Size_Median_x:", median_size_x)
  123. print("Size_Median_y:", median_size_y)
  124. print("Size_Modus_x:", modus_size_x)
  125. print("Size_Modus_y:", modus_size_y)
  126. return min_size_x,max_size_x, min_size_y, max_size_y, diagonal
  127. def distance_btw_blocks(result, path):
  128. result.to_csv(path+"/temporary/blub_distances.csv", sep=";", index=False, header=None)
  129. with open(path+"/temporary/blub_distances.csv") as csvfile:
  130. readCSV = csv.reader(csvfile, delimiter=';')
  131. result = list(readCSV)
  132. dm = np.asarray([[dist(p1, p2) for p2 in result] for p1 in result])
  133. dm_flattened = dm.flatten()
  134. dm_flattened = dm_flattened[dm_flattened != 0]
  135. dm_flattened = dm_flattened.round(decimals=0)
  136. #dm_ordered = sorted(dm_flattened)
  137. #print(dm_ordered)
  138. median= np.median(dm_flattened)
  139. mode = stats.mode(dm_flattened)
  140. most_often= pandas.value_counts(dm_flattened)
  141. #x = itemfreq(dm_flattened)
  142. #print(pandas.DataFrame(most_often, columns=["first"]).columns)
  143. #print(x)
  144. print("Distance Mode:", mode)
  145. #print(most_often)
  146. #largest = most_often.nsmallest(15, "first")
  147. #print(largest)
  148. print("Distance Median:", median)
  149. #print(max_dist, min_dist)
  150. return dm
  151. def distance_knn(dm):
  152. knn = []
  153. for row in dm:
  154. row = row[row != 0]
  155. row = row.round(decimals=2)
  156. row = sorted(row)
  157. knn.extend(row[:2])
  158. return knn
  159. def avg_words_block_clustered(words, cluster):
  160. blocks = cluster
  161. avg_words = words/blocks
  162. #print(avg_words)
  163. return avg_words
  164. # def main(uuid, filepath):
  165. # path = "/home/bscheibel/PycharmProjects/clustering"
  166. # filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
  167. # result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
  168. # print("number_blocks:", number_blocks)
  169. # print("number_words:", number_words)
  170. # print("avg words/blocks", number_words/number_blocks)
  171. # size_blocks(result)
  172. #
  173. # result_df = get_average_xy(result, path)
  174. #
  175. #
  176. # result = clustering_precomputed_dbscan_og_without.get_average_xy(result, path) #input: array of arrays, output: either csv file or array of arrays
  177. # #print(result)
  178. # result.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
  179. # #with open(path+"/temporary/blub.csv") as csvfile:
  180. # # readCSV = csv.reader(csvfile, delimiter=';')
  181. # # result = list(readCSV)
  182. # dm = distance_btw_blocks(result_df, path)
  183. # knn = distance_knn(dm)
  184. # median= np.median(knn)
  185. # mode = stats.mode(knn)
  186. # min_nn = min(knn)
  187. # avg = np.sum(knn)/len(knn)
  188. # print("min_knn:", min_nn)
  189. # print("knn_mean:", avg)
  190. # print("knn_median:", median)
  191. # print("knn_mode:", mode)
  192. #knn.nearestNeighbors(dm)
  193. #main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/Werkstattzeichnung Zwischenwelle.pdf")