"""Cluster the rows of ``values.csv`` with DBSCAN and export the result.

The script:

1. reads ``values.csv`` and selects the columns ``X1``, ``Y1``, ``X2``, ``Y2``;
2. standardizes those columns and runs DBSCAN on them;
3. prints the cluster / noise counts and, when defined, the silhouette
   coefficient;
4. plots the first two standardized columns, coloured by cluster;
5. writes ``values_clustered.csv`` (input rows plus a ``cluster`` column) and
   ``text_merged.csv`` (the ``Text`` column joined with spaces per cluster).
"""
print(__doc__)

import matplotlib.pyplot as plt
import numpy as np
import pandas
from sklearn import metrics
from sklearn.cluster import DBSCAN
# ``sklearn.datasets.samples_generator`` was removed from scikit-learn; the
# public import path below works on old and new releases alike.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

INPUT_CSV = "values.csv"
CLUSTERED_CSV = "values_clustered.csv"
MERGED_TEXT_CSV = "text_merged.csv"
FEATURE_COLUMNS = ["X1", "Y1", "X2", "Y2"]
DBSCAN_EPS = 0.15
DBSCAN_MIN_SAMPLES = 1

# #############################################################################
# Load the data and standardize the feature columns
data_df = pandas.read_csv(INPUT_CSV, sep=",")
data = data_df[FEATURE_COLUMNS]
print(data)
data = StandardScaler().fit_transform(data)

# #############################################################################
# Compute DBSCAN
# NOTE: in scikit-learn ``min_samples`` counts the point itself, so with
# ``min_samples=1`` every row is a core sample; singleton clusters are likely
# and the noise label (-1) is not expected to appear.
db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(data)
labels = db.labels_
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
print(data[labels == 0])
data_df["cluster"] = labels

# Number of clusters in labels, ignoring noise if present.
n_labels = len(set(labels))
n_clusters_ = n_labels - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# silhouette_score raises ValueError unless 2 <= n_labels <= n_samples - 1,
# which is easy to violate here (one big cluster, or one cluster per row).
if 2 <= n_labels <= len(labels) - 1:
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(data, labels))
else:
    print("Silhouette Coefficient: undefined for %d label(s) over %d samples"
          % (n_labels, len(labels)))

# #############################################################################
# Plot result (only the first two standardized feature columns are drawn)
# np.unique returns the labels sorted, so the colour given to each cluster is
# deterministic from run to run.
unique_labels = np.unique(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    # Core samples are drawn with large markers ...
    xy = data[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # ... and non-core (border / noise) samples with small ones.
    xy = data[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

# #############################################################################
# Export results
print(data_df.head(3))
data_df.to_csv(CLUSTERED_CSV)


def _join_text(texts):
    """Join one cluster's text cells with spaces, skipping missing values."""
    return ' '.join(str(text) for text in texts if pandas.notna(text))


(data_df.groupby('cluster')['Text']
    .apply(_join_text)
    .reset_index()
    .to_csv(MERGED_TEXT_CSV))