# clustering_precomputed_dbscan.py (6.7 KB)
  1. # coding: utf8
import ast
import csv
import time
from math import sqrt

import numpy as np
import pandas

from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.metrics import davies_bouldin_score
  10. def get_average_xy(list_input, path):
  11. csv_name = path+"/temporary/list_to_csv_with_corner_points.csv"
  12. resultFile = open(csv_name, 'w')
  13. wr = csv.writer(resultFile, delimiter=";")
  14. wr.writerow(["element", "xmin","ymin","xmax","ymax", "ausrichtung","point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])
  15. result_df = pandas.DataFrame(columns=["point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma","ausrichtung"])
  16. for element in list_input:
  17. ymin = 100000000
  18. ymax = 0
  19. xmin = 100000000
  20. xmax = 0
  21. newList = []
  22. if len(element) == 5 and not isinstance(element[0], list):
  23. newList.append(element)
  24. element = newList
  25. for blub in element: #get the smallest and largest x and y value for whole block
  26. if isinstance(blub[0],list) and len(blub[0]) == 5:
  27. blub = blub [0]
  28. if float(blub[1]) < ymin:
  29. ymin = float(blub[1])
  30. if float(blub[0]) < xmin:
  31. xmin = float(blub[0])
  32. if float(blub[3]) > ymax:
  33. ymax = float(blub[3])
  34. if float(blub[2]) > xmax:
  35. xmax = float(blub[2])
  36. if float(xmax)-float(xmin) > 1.3*(float(ymax)-float(ymin)):
  37. ausrichtung = 0 # horizontal
  38. #elif
  39. elif 1.3*(float(xmax)-float(xmin)) < float(ymax)-float(ymin):
  40. ausrichtung = 1 # vertikal
  41. else:
  42. ausrichtung = 3 # sonstiges
  43. ##### GET CORNER POINTS
  44. point_xmi_ymi = [xmin,ymin]
  45. point_xma_ymi = [xmax,ymin]
  46. point_xmi_yma = [xmin,ymax]
  47. point_xma_yma = [xmax,ymax]
  48. wr.writerow([element,xmin,ymin,xmax,ymax, ausrichtung,point_xmi_ymi,point_xma_ymi,point_xmi_yma,point_xma_yma])
  49. result_df.loc[len(result_df)]=[point_xmi_ymi,point_xma_ymi, point_xmi_yma, point_xma_yma,ausrichtung]
  50. resultFile.close()
  51. return result_df
  52. def intersects(rectangle1, rectangle2): #using the separating axis theorem, returns true if they intersect, otherwise false
  53. rect_1_min = eval(rectangle1[0])
  54. rect_1_max = eval(rectangle1[3])
  55. rect1_bottom_left_x = rect_1_min[0]
  56. rect1_top_right_x = rect_1_max[0]
  57. rect1_bottom_left_y = rect_1_max[1]
  58. rect1_top_right_y = rect_1_min[1]
  59. rect_2_min = eval(rectangle2[0])
  60. rect_2_max = eval(rectangle2[3])
  61. rect2_bottom_left_x = rect_2_min[0]
  62. rect2_top_right_x = rect_2_max[0]
  63. rect2_bottom_left_y = rect_2_max[1]
  64. rect2_top_right_y = rect_2_min[1]
  65. return not (rect1_top_right_x < rect2_bottom_left_x or rect1_bottom_left_x > rect2_top_right_x or rect1_top_right_y > rect2_bottom_left_y or rect1_bottom_left_y < rect2_top_right_y)
  66. def get_ausrichtung(rectangle1,rectangle2):
  67. #check if rect 1 and rect 2 are above or beside, r,l, a,b
  68. min_1 = eval(rectangle1[0])
  69. min_2 = eval(rectangle2[0])
  70. diff_y = min_1[1] - min_2[1]
  71. diff_x = min_1[0] - min_2[0]
  72. if diff_x < diff_y:
  73. ausrichtung = "above"
  74. else:
  75. ausrichtung = "side"
  76. return ausrichtung
  77. def get_parallel(rectangle1, rectangle2):
  78. parallel = False
  79. ausrichtung_1 = eval(rectangle1[4])
  80. ausrichtung_2 = eval(rectangle2[4])
  81. if ausrichtung_1 == ausrichtung_2 and ausrichtung_1 == 0:
  82. ausrichtung = get_ausrichtung(rectangle1, rectangle2)
  83. if ausrichtung == "above":
  84. parallel = True
  85. if ausrichtung_1 == ausrichtung_2 and ausrichtung_1 == 1:
  86. ausrichtung = get_ausrichtung(rectangle1, rectangle2)
  87. if ausrichtung == "side":
  88. parallel = True
  89. return parallel
  90. def dist(rectangle1, rectangle2):
  91. #get minimal distance between two rectangles
  92. distance = 100000000
  93. second_dist = 100000
  94. for point1 in rectangle1[:4]:
  95. point1 = eval(point1)
  96. for point2 in rectangle2[:4]:
  97. point2 = eval(point2)
  98. dist = sqrt((float(point2[0]) - float(point1[0]))**2 + ((float(point2[1]) - float(point1[1])))**2)
  99. if dist < distance:
  100. second_dist = distance
  101. distance = dist
  102. if get_parallel(rectangle1,rectangle2):
  103. distance += 1000
  104. second_dist += 1000
  105. if intersects(rectangle1, rectangle2):
  106. distance = 0
  107. second_dist = 0
  108. distance = (distance+second_dist)/2
  109. return distance
  110. def clustering(dm,eps,path):
  111. db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)
  112. labels = db.labels_
  113. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  114. print('Estimated number of clusters: %d' % n_clusters_)
  115. data_df = pandas.read_csv(path +"/temporary/list_to_csv_with_corner_points.csv", sep=";")
  116. data_df["cluster"] = labels
  117. try:
  118. dbs = davies_bouldin_score(dm, labels)
  119. #dbs = "1"
  120. chs = metrics.calinski_harabasz_score(dm, labels)
  121. #chs = 1
  122. silhoutte = metrics.silhouette_score(dm, labels, metric='precomputed')
  123. #silhoutte = 2
  124. print("DBscore: ", dbs)
  125. print("calsinski: ", chs)
  126. print("silhoutte: ", silhoutte)
  127. except:
  128. dbs=1
  129. chs=1
  130. silhoutte=1
  131. data_df["ausrichtung"] = 1
  132. data_df = data_df.groupby(['cluster', 'ausrichtung'])['element'].apply(','.join).reset_index()
  133. data_df.to_csv(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv",sep=";", header=False, index=False)
  134. return data_df, n_clusters_, dbs, chs, silhoutte
  135. def cluster_and_preprocess(result,eps,path):
  136. start_time = time.time()
  137. result = get_average_xy(result, path) #input: array of arrays, output: either csv file or array of arrays
  138. end_time = time.time()
  139. time_taken_get_average = end_time - start_time
  140. print("time get average: ", time_taken_get_average)
  141. start_time = time.time()
  142. result.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
  143. end_time = time.time()
  144. time_taken_tocsv = end_time - start_time
  145. print("time to csv:" , time_taken_tocsv)
  146. with open(path+"/temporary/blub.csv") as csvfile:
  147. readCSV = csv.reader(csvfile, delimiter=';')
  148. result = list(readCSV)
  149. start_time = time.time()
  150. dm = np.asarray([[dist(p1, p2) for p2 in result] for p1 in result])
  151. end_time = time.time()
  152. time_taken_dm = end_time - start_time
  153. print("time dm:" , time_taken_dm)
  154. start_time = time.time()
  155. clustering_result, n_clusters_, dbs, chs, silhoutte = clustering(dm,float(eps), path)
  156. end_time = time.time()
  157. time_taken_clustering = end_time - start_time
  158. print("time clustering:" , time_taken_clustering)
  159. return clustering_result, n_clusters_, dbs, chs, silhoutte, dm