# Natural Language Toolkit: Clusterer Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Trevor Cohn <tacohn@cs.mu.oz.au>
# Contributor: J Richard Snape
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, unicode_literals, division

from abc import abstractmethod
import copy
from sys import stdout
from math import sqrt

try:
    import numpy
except ImportError:
    pass

from nltk.cluster.api import ClusterI
from nltk.compat import python_2_unicode_compatible


class VectorSpaceClusterer(ClusterI):
    """
    Abstract clusterer which takes tokens and maps them into a vector space.
    Optionally performs singular value decomposition to reduce the
    dimensionality.
    """

    def __init__(self, normalise=False, svd_dimensions=None):
        """
        :param normalise: should vectors be normalised to length 1
        :type normalise: boolean
        :param svd_dimensions: number of dimensions to use in reducing vector
            dimensionality with SVD
        :type svd_dimensions: int
        """
        self._Tt = None
        self._should_normalise = normalise
        self._svd_dimensions = svd_dimensions

    def cluster(self, vectors, assign_clusters=False, trace=False):
        assert len(vectors) > 0

        # normalise the vectors
        if self._should_normalise:
            vectors = list(map(self._normalise, vectors))

        # use SVD to reduce the dimensionality
        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
            S = d[: self._svd_dimensions] * numpy.identity(
                self._svd_dimensions, numpy.float64
            )
            T = u[:, : self._svd_dimensions]
            Dt = vt[: self._svd_dimensions, :]
            vectors = numpy.transpose(numpy.dot(S, Dt))
            self._Tt = numpy.transpose(T)

        # call abstract method to cluster the vectors
        self.cluster_vectorspace(vectors, trace)

        # assign the vectors to clusters
        if assign_clusters:
            return [self.classify(vector) for vector in vectors]

    @abstractmethod
    def cluster_vectorspace(self, vectors, trace):
        """
        Finds the clusters using the given set of vectors.
        """

    def classify(self, vector):
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        cluster = self.classify_vectorspace(vector)
        return self.cluster_name(cluster)

    @abstractmethod
    def classify_vectorspace(self, vector):
        """
        Returns the index of the appropriate cluster for the vector.
        """

    def likelihood(self, vector, label):
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return self.likelihood_vectorspace(vector, label)

    def likelihood_vectorspace(self, vector, cluster):
        """
        Returns the likelihood of the vector belonging to the cluster.
        """
        predicted = self.classify_vectorspace(vector)
        return 1.0 if cluster == predicted else 0.0

    def vector(self, vector):
        """
        Returns the vector after normalisation and dimensionality reduction.
        """
        if self._should_normalise:
            vector = self._normalise(vector)
        if self._Tt is not None:
            vector = numpy.dot(self._Tt, vector)
        return vector

    def _normalise(self, vector):
        """
        Normalises the vector to unit length.
        """
        return vector / sqrt(numpy.dot(vector, vector))
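
# A minimal usage sketch: a concrete subclass such as nltk.cluster.KMeansClusterer
# reuses the normalisation and SVD machinery implemented above; the sample
# vectors here are made up for illustration.
#
#     from nltk.cluster import KMeansClusterer, euclidean_distance
#     import numpy
#
#     vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
#     clusterer = KMeansClusterer(2, euclidean_distance, normalise=True)
#     assignments = clusterer.cluster(vectors, assign_clusters=True)
#     # assignments lists the cluster index chosen for each input vector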


def euclidean_distance(u, v):
    """
    Returns the euclidean distance between vectors u and v. This is equivalent
    to the length of the vector (u - v).
    """
    diff = u - v
    return sqrt(numpy.dot(diff, diff))
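
# Worked example: for u = [0, 3] and v = [4, 0], u - v = [-4, 3], so the
# euclidean distance is sqrt(16 + 9) = 5.0.
#
#     >>> euclidean_distance(numpy.array([0, 3]), numpy.array([4, 0]))
#     5.0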


def cosine_distance(u, v):
    """
    Returns 1 minus the cosine of the angle between vectors v and u. This is
    equal to 1 - (u.v / |u||v|).
    """
    return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
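
# Worked example: orthogonal vectors have cosine 0, hence distance 1;
# parallel vectors have cosine 1, hence distance 0.
#
#     >>> cosine_distance(numpy.array([1, 0]), numpy.array([0, 1]))
#     1.0
#     >>> cosine_distance(numpy.array([2, 0]), numpy.array([5, 0]))
#     0.0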


class _DendrogramNode(object):
    """ Tree node of a dendrogram. """

    def __init__(self, value, *children):
        self._value = value
        self._children = children

    def leaves(self, values=True):
        if self._children:
            leaves = []
            for child in self._children:
                leaves.extend(child.leaves(values))
            return leaves
        elif values:
            return [self._value]
        else:
            return [self]

    def groups(self, n):
        queue = [(self._value, self)]

        while len(queue) < n:
            priority, node = queue.pop()
            if not node._children:
                # a leaf cannot be split any further; put it back and stop
                queue.append((priority, node))
                break
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
                else:
                    queue.append((0, child))

        # makes the earliest merges at the start, latest at the end
        queue.sort()

        groups = []
        for priority, node in queue:
            groups.append(node.leaves())
        return groups

    def __lt__(self, comparator):
        return cosine_distance(self._value, comparator._value) < 0


@python_2_unicode_compatible
class Dendrogram(object):
    """
    Represents a dendrogram, a tree with a specified branching order. It
    must be initialised with the leaf items; merge is then called iteratively
    for each branching point. The class constructs a tree representing the
    order of calls to the merge function.
    """

    def __init__(self, items=[]):
        """
        :param items: the items at the leaves of the dendrogram
        :type items: sequence of (any)
        """
        self._items = [_DendrogramNode(item) for item in items]
        self._original_items = copy.copy(self._items)
        self._merge = 1

    def merge(self, *indices):
        """
        Merges nodes at the given indices in the dendrogram. The merged node
        replaces the first node specified; all other nodes involved in the
        merge are removed.

        :param indices: indices of the items to merge (at least two)
        :type indices: seq of int
        """
        assert len(indices) >= 2
        node = _DendrogramNode(self._merge, *[self._items[i] for i in indices])
        self._merge += 1
        self._items[indices[0]] = node
        for i in indices[1:]:
            del self._items[i]

    def groups(self, n):
        """
        Finds the n-groups of items (leaves) reachable from a cut at depth n.

        :param n: number of groups
        :type n: int
        """
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        return root.groups(n)

    def show(self, leaf_labels=[]):
        """
        Print the dendrogram in ASCII art to standard out.

        :param leaf_labels: an optional list of strings to use for labeling
            the leaves
        :type leaf_labels: list
        """

        # ASCII rendering characters
        JOIN, HLINK, VLINK = '+', '-', '|'

        # find the root (or create one)
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = self._original_items

        if leaf_labels:
            last_row = leaf_labels
        else:
            last_row = ["%s" % leaf._value for leaf in leaves]

        # find the bottom row and the best cell width
        width = max(map(len, last_row)) + 1
        lhalf = width // 2
        rhalf = int(width - lhalf - 1)

        # display functions
        def format(centre, left=' ', right=' '):
            return '%s%s%s' % (lhalf * left, centre, right * rhalf)

        def display(str):
            stdout.write(str)

        # for each merge, top down
        queue = [(root._value, root)]
        verticals = [format(' ') for leaf in leaves]
        while queue:
            priority, node = queue.pop()
            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
            indices = list(map(leaves.index, child_left_leaf))
            if child_left_leaf:
                min_idx = min(indices)
                max_idx = max(indices)
            for i in range(len(leaves)):
                if leaves[i] in child_left_leaf:
                    if i == min_idx:
                        display(format(JOIN, ' ', HLINK))
                    elif i == max_idx:
                        display(format(JOIN, HLINK, ' '))
                    else:
                        display(format(JOIN, HLINK, HLINK))
                    verticals[i] = format(VLINK)
                elif min_idx <= i <= max_idx:
                    display(format(HLINK, HLINK, HLINK))
                else:
                    display(verticals[i])
            display('\n')
            for child in node._children:
                if child._children:
                    queue.append((child._value, child))
            queue.sort()

            for vertical in verticals:
                display(vertical)
            display('\n')

        # finally, display the last line
        display(''.join(item.center(width) for item in last_row))
        display('\n')

    def __repr__(self):
        if len(self._items) > 1:
            root = _DendrogramNode(self._merge, *self._items)
        else:
            root = self._items[0]
        leaves = root.leaves(False)
        return '<Dendrogram with %d leaves>' % len(leaves)
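
# A minimal usage sketch of Dendrogram (the items are made-up samples):
# leaves are supplied up front, merge() records each agglomeration, and
# groups(n) cuts the tree into n groups.
#
#     >>> d = Dendrogram(['a', 'b', 'c', 'd'])
#     >>> d.merge(0, 1)    # combine 'a' and 'b'
#     >>> d.merge(1, 2)    # combine 'c' and 'd' (indices shift after a merge)
#     >>> d.groups(2)
#     [['a', 'b'], ['c', 'd']]
#     >>> d.show()         # prints the tree as ASCII art to stdout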