# dbscan_clustering.py (4.3 KB)
import csv
import os

import numpy as np
import pandas
from sklearn import metrics
from sklearn.cluster import DBSCAN
# NOTE(review): sklearn.datasets.samples_generator was removed in
# scikit-learn 0.24; make_blobs is importable from sklearn.datasets.
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler

import order_bounding_boxes_in_each_block
  9. def my_distance(x,y):
  10. blub = "ddd"
  11. return blub
  12. def cluster(file_in, file_out):
  13. # #############################################################################
  14. data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/engineering_drawings_extraction/temporary/list_to_csv_with_avg_points.csv", sep=";")
  15. data_df.head(3)
  16. data = data_df[["xavg_elem","yavg_elem","ausrichtung"]]
  17. #print(data)
  18. data = StandardScaler().fit_transform(data)
  19. # #############################################################################
  20. # Compute DBSCAN
  21. db = DBSCAN(eps=0.075, min_samples=1, metric="euclidean").fit(data)
  22. #core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
  23. #core_samples_mask[db.core_sample_indices_] = True
  24. labels = db.labels_
  25. print(data[labels == 0])
  26. data_df["cluster"] = labels
  27. # Number of clusters in labels, ignoring noise if present.
  28. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  29. n_noise_ = list(labels).count(-1)
  30. print('Estimated number of clusters: %d' % n_clusters_)
  31. print('Estimated number of noise points: %d' % n_noise_)
  32. print("Silhouette Coefficient: %0.3f"
  33. % metrics.silhouette_score(data, labels))
  34. # #############################################################################
  35. # Plot result
  36. """ort matplotlib.pyplot as plt
  37. # Black removed and is used for noise instead.
  38. unique_labels = set(labels)
  39. colors = [plt.cm.Spectral(each)
  40. for each in np.linspace(0, 1, len(unique_labels))]
  41. for k, col in zip(unique_labels, colors):
  42. if k == -1:
  43. # Black used for noise.
  44. col = [0, 0, 0, 1]
  45. class_member_mask = (labels == k)
  46. xy = data[class_member_mask & core_samples_mask]
  47. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
  48. markeredgecolor='k', markersize=14)
  49. xy = data[class_member_mask & ~core_samples_mask]
  50. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
  51. markeredgecolor='k', markersize=6)
  52. plt.title('Estimated number of clusters: %d' % n_clusters_)
  53. plt.show()"""
  54. #print(data_df.head(3))
  55. #data_df.to_csv("values_clusteredfromPDF_GV12.csv")
  56. data_df.groupby('cluster')['element'].apply(' '.join).reset_index().to_csv("values_clusteredfromHTML_layout_LH.csv",sep=";")
  57. def get_average_xy(list_input):
  58. csv_name = "temporary/list_to_csv_with_avg_points.csv"
  59. new_list = []
  60. resultFile = open(csv_name, 'w')
  61. wr = csv.writer(resultFile, delimiter=";")
  62. wr.writerow(["element", "xavg_elem","yavg_elem", "ausrichtung"])
  63. for element in list_input:
  64. xavg_elem = 0
  65. yavg_elem = 0
  66. y_min = 1000000
  67. y_max = 0
  68. x_min = 1000000
  69. x_max = 0
  70. for blub in element:
  71. xavg_elem += (float(blub[0]) + float(blub[2]))/2
  72. yavg_elem += (float(blub[1]) + float(blub[3]))/2
  73. if float(blub[1]) < y_min:
  74. y_min = float(blub[1])
  75. #print("y_min:",y_min)
  76. if float(blub[0]) < x_min:
  77. x_min = float(blub[0])
  78. if float(blub[3]) > y_max:
  79. y_max = float(blub[3])
  80. if float(blub[2]) > x_max:
  81. x_max = float(blub[2])
  82. if x_max-x_min > y_max-y_min:
  83. ausrichtung = 0
  84. else:
  85. ausrichtung = 1
  86. xavg_elem = xavg_elem/len(element)
  87. #print(xavg_elem)
  88. yavg_elem = yavg_elem/len(element)
  89. #element.extend([xavg_elem, yavg_elem])
  90. #print(element)
  91. #new_list.append(element)
  92. wr.writerow([element,xavg_elem,yavg_elem, ausrichtung])
  93. resultFile.close()
  94. #print(new_list)
  95. return csv_name
  96. #cluster(33,33)
  97. file = "/home/bscheibel/PycharmProjects/engineering_drawings_extraction/drawings/5152166_Rev04.html"
  98. #file = "/home/bscheibel/PycharmProjects/engineering_drawings_extraction/drawings/5129275_Rev01-GV12.html"
  99. #result = order_bounding_boxes_in_each_block.get_bound_box(file)
  100. #get_average_xy(result)
  101. cluster(33,33)