# dbscan_clustering.py

import os

import numpy as np
import pandas
from sklearn import metrics
from sklearn.cluster import DBSCAN
# NOTE(review): sklearn.datasets.samples_generator was removed in
# scikit-learn 0.24; newer releases expose make_blobs from sklearn.datasets.
from sklearn.datasets.samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler


  7. def cluster(file_in, file_out):
  8. # #############################################################################
  9. data_df = pandas.read_csv("values_fromhtml_GV12.csv", sep=",")
  10. data_df.head(3)
  11. data = data_df[["X1","Y1","X2","Y2"]]
  12. print(data)
  13. data = StandardScaler().fit_transform(data)
  14. # #############################################################################
  15. # Compute DBSCAN
  16. db = DBSCAN(eps=0.2, min_samples=1).fit(data)
  17. core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
  18. core_samples_mask[db.core_sample_indices_] = True
  19. labels = db.labels_
  20. print(data[labels == 0])
  21. data_df["cluster"] = labels
  22. # Number of clusters in labels, ignoring noise if present.
  23. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  24. n_noise_ = list(labels).count(-1)
  25. print('Estimated number of clusters: %d' % n_clusters_)
  26. print('Estimated number of noise points: %d' % n_noise_)
  27. print("Silhouette Coefficient: %0.3f"
  28. % metrics.silhouette_score(data, labels))
  29. # #############################################################################
  30. # Plot result
  31. """ort matplotlib.pyplot as plt
  32. # Black removed and is used for noise instead.
  33. unique_labels = set(labels)
  34. colors = [plt.cm.Spectral(each)
  35. for each in np.linspace(0, 1, len(unique_labels))]
  36. for k, col in zip(unique_labels, colors):
  37. if k == -1:
  38. # Black used for noise.
  39. col = [0, 0, 0, 1]
  40. class_member_mask = (labels == k)
  41. xy = data[class_member_mask & core_samples_mask]
  42. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
  43. markeredgecolor='k', markersize=14)
  44. xy = data[class_member_mask & ~core_samples_mask]
  45. plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
  46. markeredgecolor='k', markersize=6)
  47. plt.title('Estimated number of clusters: %d' % n_clusters_)
  48. plt.show()"""
  49. print(data_df.head(3))
  50. #data_df.to_csv("values_clusteredfromPDF_GV12.csv")
  51. data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("values_clusteredfromPDF_GV12.csv")
# Module-level call: runs the clustering whenever this file is executed or imported.
# NOTE(review): both arguments are the placeholder value 33, not file names —
# confirm the intended input/output paths.
cluster(33,33)