# dbscan_clustering.py
  1. import numpy as np
  2. import pandas
  3. import csv
  4. import order_bounding_boxes_in_each_block
  5. from sklearn.cluster import DBSCAN
  6. from sklearn import metrics
  7. from sklearn.datasets.samples_generator import make_blobs
  8. from sklearn.preprocessing import StandardScaler
  9. def cluster(file_in, file_out):
  10. # #############################################################################
  11. data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/list_to_csv_with_avg_points.csv", sep=";")
  12. data_df.head(3)
  13. data = data_df[["xavg_elem","yavg_elem"]]
  14. print(data)
  15. data = StandardScaler().fit_transform(data)
  16. # #############################################################################
  17. # Compute DBSCAN
  18. db = DBSCAN(eps=0.1, min_samples=1).fit(data)
  19. core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
  20. core_samples_mask[db.core_sample_indices_] = True
  21. labels = db.labels_
  22. print(data[labels == 0])
  23. data_df["cluster"] = labels
  24. # Number of clusters in labels, ignoring noise if present.
  25. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  26. n_noise_ = list(labels).count(-1)
  27. print('Estimated number of clusters: %d' % n_clusters_)
  28. print('Estimated number of noise points: %d' % n_noise_)
  29. print("Silhouette Coefficient: %0.3f"
  30. % metrics.silhouette_score(data, labels))
  31. # #############################################################################
  32. # Plot result
  33. """ort matplotlib.pyplot as plt
  34. # Black removed and is used for noise instead.
  35. unique_labels = set(labels)
  36. colors = [plt.cm.Spectral(each)
  37. for each in np.linspace(0, 1, len(unique_labels))]
  38. for k, col in zip(unique_labels, colors):
  39. if k == -1:
  40. # Black used for noise.
  41. col = [0, 0, 0, 1]
  42. class_member_mask = (labels == k)
  43. xy = data[class_member_mask & core_samples_mask]
  44. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
  45. markeredgecolor='k', markersize=14)
  46. xy = data[class_member_mask & ~core_samples_mask]
  47. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
  48. markeredgecolor='k', markersize=6)
  49. plt.title('Estimated number of clusters: %d' % n_clusters_)
  50. plt.show()"""
  51. #print(data_df.head(3))
  52. #data_df.to_csv("values_clusteredfromPDF_GV12.csv")
  53. data_df.groupby('cluster')['element'].apply(' '.join).reset_index().to_csv("values_clusteredfromHTML_layout_LH.csv", delimiter=";")
  54. def get_average_xy(list_input):
  55. csv_name = "temporary/list_to_csv_with_avg_points.csv"
  56. new_list = []
  57. resultFile = open(csv_name, 'a')
  58. wr = csv.writer(resultFile, delimiter=";")
  59. wr.writerow(["element", "xavg_elem","yavg_elem"])
  60. for element in list_input:
  61. xavg_elem = 0
  62. yavg_elem = 0
  63. for blub in element:
  64. xavg_elem += (float(blub[0]) + float(blub[2]))/2
  65. yavg_elem += (float(blub[1]) + float(blub[3]))/2
  66. xavg_elem = xavg_elem/len(element)
  67. #print(xavg_elem)
  68. yavg_elem = yavg_elem/len(element)
  69. #element.extend([xavg_elem, yavg_elem])
  70. #print(element)
  71. #new_list.append(element)
  72. wr.writerow([element,xavg_elem,yavg_elem])
  73. resultFile.close()
  74. #print(new_list)
  75. return csv_name
  76. #cluster(33,33)
  77. #result = order_bounding_boxes_in_each_block.get_bound_box()
  78. #get_average_xy(result)
  79. cluster(33,33)