api.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. # Natural Language Toolkit: Clusterer Interfaces
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
  5. # Porting: Steven Bird <stevenbird1@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from abc import ABCMeta, abstractmethod
  9. from six import add_metaclass
  10. from nltk.probability import DictionaryProbDist
  11. @add_metaclass(ABCMeta)
  12. class ClusterI(object):
  13. """
  14. Interface covering basic clustering functionality.
  15. """
  16. @abstractmethod
  17. def cluster(self, vectors, assign_clusters=False):
  18. """
  19. Assigns the vectors to clusters, learning the clustering parameters
  20. from the data. Returns a cluster identifier for each vector.
  21. """
  22. @abstractmethod
  23. def classify(self, token):
  24. """
  25. Classifies the token into a cluster, setting the token's CLUSTER
  26. parameter to that cluster identifier.
  27. """
  28. def likelihood(self, vector, label):
  29. """
  30. Returns the likelihood (a float) of the token having the
  31. corresponding cluster.
  32. """
  33. if self.classify(vector) == label:
  34. return 1.0
  35. else:
  36. return 0.0
  37. def classification_probdist(self, vector):
  38. """
  39. Classifies the token into a cluster, returning
  40. a probability distribution over the cluster identifiers.
  41. """
  42. likelihoods = {}
  43. sum = 0.0
  44. for cluster in self.cluster_names():
  45. likelihoods[cluster] = self.likelihood(vector, cluster)
  46. sum += likelihoods[cluster]
  47. for cluster in self.cluster_names():
  48. likelihoods[cluster] /= sum
  49. return DictionaryProbDist(likelihoods)
  50. @abstractmethod
  51. def num_clusters(self):
  52. """
  53. Returns the number of clusters.
  54. """
  55. def cluster_names(self):
  56. """
  57. Returns the names of the clusters.
  58. :rtype: list
  59. """
  60. return list(range(self.num_clusters()))
  61. def cluster_name(self, index):
  62. """
  63. Returns the names of the cluster at index.
  64. """
  65. return index