123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- # Natural Language Toolkit: Group Average Agglomerative Clusterer
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import print_function, unicode_literals, division
- try:
- import numpy
- except ImportError:
- pass
- from nltk.cluster.util import VectorSpaceClusterer, Dendrogram, cosine_distance
- from nltk.compat import python_2_unicode_compatible
- @python_2_unicode_compatible
- class GAAClusterer(VectorSpaceClusterer):
- """
- The Group Average Agglomerative starts with each of the N vectors as singleton
- clusters. It then iteratively merges pairs of clusters which have the
- closest centroids. This continues until there is only one cluster. The
- order of merges gives rise to a dendrogram: a tree with the earlier merges
- lower than later merges. The membership of a given number of clusters c, 1
- <= c <= N, can be found by cutting the dendrogram at depth c.
- This clusterer uses the cosine similarity metric only, which allows for
- efficient speed-up in the clustering process.
- """
- def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
- VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
- self._num_clusters = num_clusters
- self._dendrogram = None
- self._groups_values = None
- def cluster(self, vectors, assign_clusters=False, trace=False):
- # stores the merge order
- self._dendrogram = Dendrogram(
- [numpy.array(vector, numpy.float64) for vector in vectors]
- )
- return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)
- def cluster_vectorspace(self, vectors, trace=False):
- # variables describing the initial situation
- N = len(vectors)
- cluster_len = [1] * N
- cluster_count = N
- index_map = numpy.arange(N)
- # construct the similarity matrix
- dims = (N, N)
- dist = numpy.ones(dims, dtype=numpy.float) * numpy.inf
- for i in range(N):
- for j in range(i + 1, N):
- dist[i, j] = cosine_distance(vectors[i], vectors[j])
- while cluster_count > max(self._num_clusters, 1):
- i, j = numpy.unravel_index(dist.argmin(), dims)
- if trace:
- print("merging %d and %d" % (i, j))
- # update similarities for merging i and j
- self._merge_similarities(dist, cluster_len, i, j)
- # remove j
- dist[:, j] = numpy.inf
- dist[j, :] = numpy.inf
- # merge the clusters
- cluster_len[i] = cluster_len[i] + cluster_len[j]
- self._dendrogram.merge(index_map[i], index_map[j])
- cluster_count -= 1
- # update the index map to reflect the indexes if we
- # had removed j
- index_map[j + 1 :] -= 1
- index_map[j] = N
- self.update_clusters(self._num_clusters)
- def _merge_similarities(self, dist, cluster_len, i, j):
- # the new cluster i merged from i and j adopts the average of
- # i and j's similarity to each other cluster, weighted by the
- # number of points in the clusters i and j
- i_weight = cluster_len[i]
- j_weight = cluster_len[j]
- weight_sum = i_weight + j_weight
- # update for x<i
- dist[:i, i] = dist[:i, i] * i_weight + dist[:i, j] * j_weight
- dist[:i, i] /= weight_sum
- # update for i<x<j
- dist[i, i + 1 : j] = (
- dist[i, i + 1 : j] * i_weight + dist[i + 1 : j, j] * j_weight
- )
- # update for i<j<x
- dist[i, j + 1 :] = dist[i, j + 1 :] * i_weight + dist[j, j + 1 :] * j_weight
- dist[i, i + 1 :] /= weight_sum
- def update_clusters(self, num_clusters):
- clusters = self._dendrogram.groups(num_clusters)
- self._centroids = []
- for cluster in clusters:
- assert len(cluster) > 0
- if self._should_normalise:
- centroid = self._normalise(cluster[0])
- else:
- centroid = numpy.array(cluster[0])
- for vector in cluster[1:]:
- if self._should_normalise:
- centroid += self._normalise(vector)
- else:
- centroid += vector
- centroid /= len(cluster)
- self._centroids.append(centroid)
- self._num_clusters = len(self._centroids)
- def classify_vectorspace(self, vector):
- best = None
- for i in range(self._num_clusters):
- centroid = self._centroids[i]
- dist = cosine_distance(vector, centroid)
- if not best or dist < best[0]:
- best = (dist, i)
- return best[1]
- def dendrogram(self):
- """
- :return: The dendrogram representing the current clustering
- :rtype: Dendrogram
- """
- return self._dendrogram
- def num_clusters(self):
- return self._num_clusters
- def __repr__(self):
- return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
- def demo():
- """
- Non-interactive demonstration of the clusterers with simple 2-D data.
- """
- from nltk.cluster import GAAClusterer
- # use a set of tokens with 2D indices
- vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
- # test the GAAC clusterer with 4 clusters
- clusterer = GAAClusterer(4)
- clusters = clusterer.cluster(vectors, True)
- print('Clusterer:', clusterer)
- print('Clustered:', vectors)
- print('As:', clusters)
- print()
- # show the dendrogram
- clusterer.dendrogram().show()
- # classify a new vector
- vector = numpy.array([3, 3])
- print('classify(%s):' % vector, end=' ')
- print(clusterer.classify(vector))
- print()
- if __name__ == '__main__':
- demo()
|