# Natural Language Toolkit: Expectation Maximization Clusterer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function, unicode_literals

try:
    import numpy
except ImportError:
    pass

from nltk.compat import python_2_unicode_compatible
from nltk.cluster.util import VectorSpaceClusterer


@python_2_unicode_compatible
class EMClusterer(VectorSpaceClusterer):
  16. """
  17. The Gaussian EM clusterer models the vectors as being produced by
  18. a mixture of k Gaussian sources. The parameters of these sources
  19. (prior probability, mean and covariance matrix) are then found to
  20. maximise the likelihood of the given data. This is done with the
  21. expectation maximisation algorithm. It starts with k arbitrarily
  22. chosen means, priors and covariance matrices. It then calculates
  23. the membership probabilities for each vector in each of the
  24. clusters; this is the 'E' step. The cluster parameters are then
  25. updated in the 'M' step using the maximum likelihood estimate from
  26. the cluster membership probabilities. This process continues until
  27. the likelihood of the data does not significantly increase.
  28. """

    def __init__(
        self,
        initial_means,
        priors=None,
        covariance_matrices=None,
        conv_threshold=1e-6,
        bias=0.1,
        normalise=False,
        svd_dimensions=None,
    ):
  39. """
  40. Creates an EM clusterer with the given starting parameters,
  41. convergence threshold and vector mangling parameters.
  42. :param initial_means: the means of the gaussian cluster centers
  43. :type initial_means: [seq of] numpy array or seq of SparseArray
  44. :param priors: the prior probability for each cluster
  45. :type priors: numpy array or seq of float
  46. :param covariance_matrices: the covariance matrix for each cluster
  47. :type covariance_matrices: [seq of] numpy array
  48. :param conv_threshold: maximum change in likelihood before deemed
  49. convergent
  50. :type conv_threshold: int or float
  51. :param bias: variance bias used to ensure non-singular covariance
  52. matrices
  53. :type bias: float
  54. :param normalise: should vectors be normalised to length 1
  55. :type normalise: boolean
  56. :param svd_dimensions: number of dimensions to use in reducing vector
  57. dimensionsionality with SVD
  58. :type svd_dimensions: int
  59. """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._means = numpy.array(initial_means, numpy.float64)
        self._num_clusters = len(initial_means)
        self._conv_threshold = conv_threshold
        self._covariance_matrices = covariance_matrices
        self._priors = priors
        self._bias = bias
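
    # Illustrative construction sketch (an addition for exposition only, not
    # executed and not part of the NLTK API): two clusters in 2-D with
    # explicit priors and identity covariance matrices, making the parameters
    # documented above concrete.
    #
    #     >>> import numpy
    #     >>> clusterer = EMClusterer(
    #     ...     initial_means=[[0.0, 0.0], [4.0, 4.0]],
    #     ...     priors=[0.5, 0.5],
    #     ...     covariance_matrices=[numpy.identity(2), numpy.identity(2)],
    #     ...     conv_threshold=1e-6,
    #     ...     bias=0.1,
    #     ... )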

    def num_clusters(self):
        return self._num_clusters

    def cluster_vectorspace(self, vectors, trace=False):
        assert len(vectors) > 0

        # set the parameters to initial values
        dimensions = len(vectors[0])
        means = self._means
        priors = self._priors
        if not priors:
            priors = self._priors = (
                numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters
            )
        covariances = self._covariance_matrices
        if not covariances:
            covariances = self._covariance_matrices = [
                numpy.identity(dimensions, numpy.float64)
                for i in range(self._num_clusters)
            ]

        # do the E and M steps until the likelihood plateaus
        lastl = self._loglikelihood(vectors, priors, means, covariances)
        converged = False

        while not converged:
            if trace:
                print('iteration; loglikelihood', lastl)

            # E-step, calculate hidden variables, h[i,j]
            h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64)
            for i in range(len(vectors)):
                for j in range(self._num_clusters):
                    h[i, j] = priors[j] * self._gaussian(
                        means[j], covariances[j], vectors[i]
                    )
                h[i, :] /= sum(h[i, :])

            # M-step, update parameters - cvm, p, mean
            for j in range(self._num_clusters):
                covariance_before = covariances[j]
                new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64)
                new_mean = numpy.zeros(dimensions, numpy.float64)
                sum_hj = 0.0
                for i in range(len(vectors)):
                    delta = vectors[i] - means[j]
                    new_covariance += h[i, j] * numpy.multiply.outer(delta, delta)
                    sum_hj += h[i, j]
                    new_mean += h[i, j] * vectors[i]
                covariances[j] = new_covariance / sum_hj
                means[j] = new_mean / sum_hj
                priors[j] = sum_hj / len(vectors)

                # bias term to stop covariance matrix being singular
                covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64)

            # calculate likelihood - FIXME: may be broken
            l = self._loglikelihood(vectors, priors, means, covariances)

            # check for convergence
            if abs(lastl - l) < self._conv_threshold:
                converged = True
            lastl = l

    def classify_vectorspace(self, vector):
        best = None
        for j in range(self._num_clusters):
            p = self._priors[j] * self._gaussian(
                self._means[j], self._covariance_matrices[j], vector
            )
            if not best or p > best[0]:
                best = (p, j)
        return best[1]

    def likelihood_vectorspace(self, vector, cluster):
        # map the cluster name to its index, then use that index consistently
        cid = self.cluster_names().index(cluster)
        return self._priors[cid] * self._gaussian(
            self._means[cid], self._covariance_matrices[cid], vector
        )

    def _gaussian(self, mean, cvm, x):
        # multivariate Gaussian density at x for the given mean and
        # covariance matrix cvm
        m = len(mean)
        assert cvm.shape == (m, m), 'bad sized covariance matrix, %s' % str(cvm.shape)
        try:
            det = numpy.linalg.det(cvm)
            inv = numpy.linalg.inv(cvm)
            a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0)
            dx = x - mean
            # print(dx, inv)  # debugging output; left disabled
            b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx)
            return a * numpy.exp(b)
        except OverflowError:
            # happens when the exponent is negative infinity - i.e. b = 0
            # i.e. the inverse of cvm is huge (cvm is almost zero)
            return 0

    def _loglikelihood(self, vectors, priors, means, covariances):
        llh = 0.0
        for vector in vectors:
            p = 0
            for j in range(len(priors)):
                p += priors[j] * self._gaussian(means[j], covariances[j], vector)
            llh += numpy.log(p)
        return llh

    def __repr__(self):
        return '<EMClusterer means=%s>' % list(self._means)
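

# A minimal illustrative sketch (an addition for exposition, not part of the
# NLTK API) of the E step described in the EMClusterer docstring: each
# vector's responsibilities h[i, j] are the prior-weighted Gaussian densities,
# renormalised so they sum to one over the clusters. For simplicity this
# assumes 1-D data and per-cluster variances.
def _e_step_responsibilities(vectors, priors, means, variances):
    h = numpy.zeros((len(vectors), len(priors)), numpy.float64)
    for i, x in enumerate(vectors):
        for j in range(len(priors)):
            # 1-D Gaussian density with the given mean and variance
            density = numpy.exp(-0.5 * (x - means[j]) ** 2 / variances[j]) / numpy.sqrt(
                2 * numpy.pi * variances[j]
            )
            h[i, j] = priors[j] * density
        # normalise so the responsibilities for vector i sum to 1
        h[i, :] /= h[i, :].sum()
    return h


# For example, _e_step_responsibilities([0.0, 3.5], [0.5, 0.5], [0.0, 4.0],
# [1.0, 1.0]) assigns the first point almost entirely to cluster 0 and the
# second almost entirely to cluster 1.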


def demo():
    """
    Non-interactive demonstration of the clusterers with simple 2-D data.
    """

    from nltk import cluster

    # example from figure 14.10, page 519, Manning and Schutze

    vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]
    means = [[4, 2], [4, 2.01]]

    clusterer = cluster.EMClusterer(means, bias=0.1)
    clusters = clusterer.cluster(vectors, True, trace=True)
    print('Clustered:', vectors)
    print('As:       ', clusters)
    print()

    for c in range(2):
        print('Cluster:', c)
        print('Prior:  ', clusterer._priors[c])
        print('Mean:   ', clusterer._means[c])
        print('Covar:  ', clusterer._covariance_matrices[c])
        print()

    # classify a new vector
    vector = numpy.array([2, 2])
    print('classify(%s):' % vector, end=' ')
    print(clusterer.classify(vector))

    # show the classification probabilities
    vector = numpy.array([2, 2])
    print('classification_probdist(%s):' % vector)
    pdist = clusterer.classification_probdist(vector)
    for sample in pdist.samples():
        print('%s => %.0f%%' % (sample, pdist.prob(sample) * 100))

#
#     The following demo code is broken.
#
#     # use a set of tokens with 2D indices
#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
#
#     # test the EM clusterer with means given by k-means (2) and
#     # dimensionality reduction
#     clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1)
#     print 'Clusterer:', clusterer
#     clusters = clusterer.cluster(vectors)
#     means = clusterer.means()
#     print 'Means:', clusterer.means()
#     print
#
#     clusterer = cluster.EMClusterer(means, svd_dimensions=1)
#     clusters = clusterer.cluster(vectors, True)
#     print 'Clusterer:', clusterer
#     print 'Clustered:', str(vectors)[:60], '...'
#     print 'As:', str(clusters)[:60], '...'
#     print
#
#     # classify a new vector
#     vector = numpy.array([3, 3])
#     print 'classify(%s):' % vector,
#     print clusterer.classify(vector)
#     print
#
#     # show the classification probabilities
#     vector = numpy.array([2.2, 2])
#     print 'classification_probdist(%s)' % vector
#     pdist = clusterer.classification_probdist(vector)
#     for sample in pdist:
#         print '%s => %.0f%%' % (sample, pdist.prob(sample) *100)

if __name__ == '__main__':
    demo()