# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  7. """
  8. A classifier based on the Naive Bayes algorithm. In order to find the
  9. probability for a label, this algorithm first uses the Bayes rule to
  10. express P(label|features) in terms of P(label) and P(features|label):
  11. | P(label) * P(features|label)
  12. | P(label|features) = ------------------------------
  13. | P(features)
  14. The algorithm then makes the 'naive' assumption that all features are
  15. independent, given the label:
  16. | P(label) * P(f1|label) * ... * P(fn|label)
  17. | P(label|features) = --------------------------------------------
  18. | P(features)
  19. Rather than computing P(features) explicitly, the algorithm just
  20. calculates the numerator for each label, and normalizes them so they
  21. sum to one:
  22. | P(label) * P(f1|label) * ... * P(fn|label)
  23. | P(label|features) = --------------------------------------------
  24. | SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
  25. """

from __future__ import print_function, unicode_literals

from collections import defaultdict

from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI

##//////////////////////////////////////////////////////
##  Naive Bayes Classifier
##//////////////////////////////////////////////////////

class NaiveBayesClassifier(ClassifierI):
    """
    A Naive Bayes classifier. Naive Bayes classifiers are
    parameterized by two probability distributions:

      - P(label) gives the probability that an input will receive each
        label, given no information about the input's features.

      - P(fname=fval|label) gives the probability that a given feature
        (fname) will receive a given value (fval), given the label
        (label).

    If the classifier encounters an input with a feature that has
    never been seen with any label, then rather than assigning a
    probability of 0 to all labels, it will ignore that feature.

    The feature value 'None' is reserved for unseen feature values;
    you generally should not use 'None' as a feature value for one of
    your own features.
    """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels. It is expressed as a ``ProbDistI`` whose
            samples are labels. I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels. It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values. I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``. If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
        """
        self._label_probdist = label_probdist
        self._feature_probdist = feature_probdist
        self._labels = list(label_probdist.samples())

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # print 'Ignoring unseen feature %s' % fname
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print('Most Informative Features')

        for (fname, fval) in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                [l for l in self._labels if fval in cpdist[l, fname].samples()],
                key=labelprob,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                (
                    '%24s = %-14r %6s : %-6s = %s : 1.0'
                    % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio)
                )
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier. For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        if hasattr(self, '_most_informative_features'):
            return self._most_informative_features[:n]
        else:
            # The set of (fname, fval) pairs used by this classifier.
            features = set()
            # The max & min probability associated w/ each (fname, fval)
            # pair. Maps (fname,fval) -> float.
            maxprob = defaultdict(lambda: 0.0)
            minprob = defaultdict(lambda: 1.0)

            for (label, fname), probdist in self._feature_probdist.items():
                for fval in probdist.samples():
                    feature = (fname, fval)
                    features.add(feature)
                    p = probdist.prob(fval)
                    maxprob[feature] = max(p, maxprob[feature])
                    minprob[feature] = min(p, minprob[feature])
                    if minprob[feature] == 0:
                        features.discard(feature)

            # Convert features to a list, & sort it by how informative
            # features are.
            self._most_informative_features = sorted(
                features, key=lambda feature_: minprob[feature_] / maxprob[feature_]
            )
            return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist[label] += 1
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname][fval] += 1
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.' This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                # Only add a None key when necessary, i.e. if there are
                # any samples with feature 'fname' missing.
                if num_samples - count > 0:
                    feature_freqdist[label, fname][None] += num_samples - count
                    feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label, fname] = probdist

        return cls(label_probdist, feature_probdist)

##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////

def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()

if __name__ == '__main__':
    demo()