kmeans.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. # Natural Language Toolkit: K-Means Clusterer
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. from __future__ import print_function, unicode_literals, division
  8. import copy
  9. import random
  10. import sys
  11. try:
  12. import numpy
  13. except ImportError:
  14. pass
  15. from nltk.cluster.util import VectorSpaceClusterer
  16. from nltk.compat import python_2_unicode_compatible
  17. @python_2_unicode_compatible
  18. class KMeansClusterer(VectorSpaceClusterer):
  19. """
  20. The K-means clusterer starts with k arbitrary chosen means then allocates
  21. each vector to the cluster with the closest mean. It then recalculates the
  22. means of each cluster as the centroid of the vectors in the cluster. This
  23. process repeats until the cluster memberships stabilise. This is a
  24. hill-climbing algorithm which may converge to a local maximum. Hence the
  25. clustering is often repeated with random initial means and the most
  26. commonly occurring output means are chosen.
  27. """
  28. def __init__(
  29. self,
  30. num_means,
  31. distance,
  32. repeats=1,
  33. conv_test=1e-6,
  34. initial_means=None,
  35. normalise=False,
  36. svd_dimensions=None,
  37. rng=None,
  38. avoid_empty_clusters=False,
  39. ):
  40. """
  41. :param num_means: the number of means to use (may use fewer)
  42. :type num_means: int
  43. :param distance: measure of distance between two vectors
  44. :type distance: function taking two vectors and returing a float
  45. :param repeats: number of randomised clustering trials to use
  46. :type repeats: int
  47. :param conv_test: maximum variation in mean differences before
  48. deemed convergent
  49. :type conv_test: number
  50. :param initial_means: set of k initial means
  51. :type initial_means: sequence of vectors
  52. :param normalise: should vectors be normalised to length 1
  53. :type normalise: boolean
  54. :param svd_dimensions: number of dimensions to use in reducing vector
  55. dimensionsionality with SVD
  56. :type svd_dimensions: int
  57. :param rng: random number generator (or None)
  58. :type rng: Random
  59. :param avoid_empty_clusters: include current centroid in computation
  60. of next one; avoids undefined behavior
  61. when clusters become empty
  62. :type avoid_empty_clusters: boolean
  63. """
  64. VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
  65. self._num_means = num_means
  66. self._distance = distance
  67. self._max_difference = conv_test
  68. assert not initial_means or len(initial_means) == num_means
  69. self._means = initial_means
  70. assert repeats >= 1
  71. assert not (initial_means and repeats > 1)
  72. self._repeats = repeats
  73. self._rng = rng if rng else random.Random()
  74. self._avoid_empty_clusters = avoid_empty_clusters
  75. def cluster_vectorspace(self, vectors, trace=False):
  76. if self._means and self._repeats > 1:
  77. print('Warning: means will be discarded for subsequent trials')
  78. meanss = []
  79. for trial in range(self._repeats):
  80. if trace:
  81. print('k-means trial', trial)
  82. if not self._means or trial > 1:
  83. self._means = self._rng.sample(list(vectors), self._num_means)
  84. self._cluster_vectorspace(vectors, trace)
  85. meanss.append(self._means)
  86. if len(meanss) > 1:
  87. # sort the means first (so that different cluster numbering won't
  88. # effect the distance comparison)
  89. for means in meanss:
  90. means.sort(key=sum)
  91. # find the set of means that's minimally different from the others
  92. min_difference = min_means = None
  93. for i in range(len(meanss)):
  94. d = 0
  95. for j in range(len(meanss)):
  96. if i != j:
  97. d += self._sum_distances(meanss[i], meanss[j])
  98. if min_difference is None or d < min_difference:
  99. min_difference, min_means = d, meanss[i]
  100. # use the best means
  101. self._means = min_means
  102. def _cluster_vectorspace(self, vectors, trace=False):
  103. if self._num_means < len(vectors):
  104. # perform k-means clustering
  105. converged = False
  106. while not converged:
  107. # assign the tokens to clusters based on minimum distance to
  108. # the cluster means
  109. clusters = [[] for m in range(self._num_means)]
  110. for vector in vectors:
  111. index = self.classify_vectorspace(vector)
  112. clusters[index].append(vector)
  113. if trace:
  114. print('iteration')
  115. # for i in range(self._num_means):
  116. # print ' mean', i, 'allocated', len(clusters[i]), 'vectors'
  117. # recalculate cluster means by computing the centroid of each cluster
  118. new_means = list(map(self._centroid, clusters, self._means))
  119. # measure the degree of change from the previous step for convergence
  120. difference = self._sum_distances(self._means, new_means)
  121. if difference < self._max_difference:
  122. converged = True
  123. # remember the new means
  124. self._means = new_means
  125. def classify_vectorspace(self, vector):
  126. # finds the closest cluster centroid
  127. # returns that cluster's index
  128. best_distance = best_index = None
  129. for index in range(len(self._means)):
  130. mean = self._means[index]
  131. dist = self._distance(vector, mean)
  132. if best_distance is None or dist < best_distance:
  133. best_index, best_distance = index, dist
  134. return best_index
  135. def num_clusters(self):
  136. if self._means:
  137. return len(self._means)
  138. else:
  139. return self._num_means
  140. def means(self):
  141. """
  142. The means used for clustering.
  143. """
  144. return self._means
  145. def _sum_distances(self, vectors1, vectors2):
  146. difference = 0.0
  147. for u, v in zip(vectors1, vectors2):
  148. difference += self._distance(u, v)
  149. return difference
  150. def _centroid(self, cluster, mean):
  151. if self._avoid_empty_clusters:
  152. centroid = copy.copy(mean)
  153. for vector in cluster:
  154. centroid += vector
  155. return centroid / (1 + len(cluster))
  156. else:
  157. if not len(cluster):
  158. sys.stderr.write('Error: no centroid defined for empty cluster.\n')
  159. sys.stderr.write(
  160. 'Try setting argument \'avoid_empty_clusters\' to True\n'
  161. )
  162. assert False
  163. centroid = copy.copy(cluster[0])
  164. for vector in cluster[1:]:
  165. centroid += vector
  166. return centroid / len(cluster)
  167. def __repr__(self):
  168. return '<KMeansClusterer means=%s repeats=%d>' % (self._means, self._repeats)
  169. #################################################################################
  170. def demo():
  171. # example from figure 14.9, page 517, Manning and Schutze
  172. from nltk.cluster import KMeansClusterer, euclidean_distance
  173. vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
  174. means = [[4, 3], [5, 5]]
  175. clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means)
  176. clusters = clusterer.cluster(vectors, True, trace=True)
  177. print('Clustered:', vectors)
  178. print('As:', clusters)
  179. print('Means:', clusterer.means())
  180. print()
  181. vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
  182. # test k-means using the euclidean distance metric, 2 means and repeat
  183. # clustering 10 times with random seeds
  184. clusterer = KMeansClusterer(2, euclidean_distance, repeats=10)
  185. clusters = clusterer.cluster(vectors, True)
  186. print('Clustered:', vectors)
  187. print('As:', clusters)
  188. print('Means:', clusterer.means())
  189. print()
  190. # classify a new vector
  191. vector = numpy.array([3, 3])
  192. print('classify(%s):' % vector, end=' ')
  193. print(clusterer.classify(vector))
  194. print()
# Run the demonstration when executed as a script.
if __name__ == '__main__':
    demo()