# Natural Language Toolkit: Positive Naive Bayes Classifier
#
# Copyright (C) 2012 NLTK Project
# Author: Alessandro Presta <alessandro.presta@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A variant of the Naive Bayes Classifier that performs binary classification with
partially-labeled training sets. In other words, assume we want to build a classifier
that assigns each example to one of two complementary classes (e.g., male names and
female names).

If we have a training set with labeled examples for both classes, we can use a
standard Naive Bayes Classifier. However, consider the case when we only have labeled
examples for one of the classes, and other, unlabeled, examples.

Then, assuming a prior distribution on the two labels, we can use the unlabeled set
to estimate the frequencies of the various features.

Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1
and unlabeled examples. We are also given an estimate of P(1).

We compute P(feature|1) exactly as in the standard case.

To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are
assuming that the unlabeled examples are drawn according to the given prior distribution)
and then express the conditional probability as:

|                  P(feature) - P(feature|1) * P(1)
|  P(feature|0) = ----------------------------------
|                                P(0)
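
For instance (with made-up numbers): if P(1) = 0.5, P(feature) = 0.4 and
P(feature|1) = 0.6, the formula gives P(feature|0) = (0.4 - 0.6 * 0.5) / 0.5 = 0.2.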

Example:

    >>> from nltk.classify import PositiveNaiveBayesClassifier

Some sentences about sports:

    >>> sports_sentences = [ 'The team dominated the game',
    ...                      'They lost the ball',
    ...                      'The game was intense',
    ...                      'The goalkeeper catched the ball',
    ...                      'The other team controlled the ball' ]

Mixed topics, including sports:

    >>> various_sentences = [ 'The President did not comment',
    ...                       'I lost the keys',
    ...                       'The team won the game',
    ...                       'Sara has two kids',
    ...                       'The ball went off the court',
    ...                       'They had the ball for the whole game',
    ...                       'The show is over' ]

The features of a sentence are simply the words it contains:

    >>> def features(sentence):
    ...     words = sentence.lower().split()
    ...     return dict(('contains(%s)' % w, True) for w in words)

We use the sports sentences as positive examples, the mixed ones as unlabeled examples:

    >>> positive_featuresets = map(features, sports_sentences)
    >>> unlabeled_featuresets = map(features, various_sentences)
    >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets,
    ...                                                 unlabeled_featuresets)

Is the following sentence about sports?

    >>> classifier.classify(features('The cat is on the table'))
    False

What about this one?

    >>> classifier.classify(features('My team lost the game'))
    True
"""
from collections import defaultdict

from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist
from nltk.classify.naivebayes import NaiveBayesClassifier

##//////////////////////////////////////////////////////
##  Positive Naive Bayes Classifier
##//////////////////////////////////////////////////////
class PositiveNaiveBayesClassifier(NaiveBayesClassifier):
    @staticmethod
    def train(
        positive_featuresets,
        unlabeled_featuresets,
        positive_prob_prior=0.5,
        estimator=ELEProbDist,
    ):
        """
        :param positive_featuresets: An iterable of featuresets that are known as positive
            examples (i.e., their label is ``True``).

        :param unlabeled_featuresets: An iterable of featuresets whose label is unknown.

        :param positive_prob_prior: A prior estimate of the probability of the label
            ``True`` (default 0.5).
        """
        positive_feature_freqdist = defaultdict(FreqDist)
        unlabeled_feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred in positive examples.
        num_positive_examples = 0
        for featureset in positive_featuresets:
            for fname, fval in featureset.items():
                positive_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_positive_examples += 1

        # Count up how many times each feature value occurred in unlabeled examples.
        num_unlabeled_examples = 0
        for featureset in unlabeled_featuresets:
            for fname, fval in featureset.items():
                unlabeled_feature_freqdist[fname][fval] += 1
                feature_values[fname].add(fval)
                fnames.add(fname)
            num_unlabeled_examples += 1

        # If a feature didn't have a value given for an instance, then we assume that
        # it gets the implicit value 'None'.
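        # For example, if only three of five positive featuresets contain
        # 'contains(ball)', the other two are counted below as having the value
        # None for that feature (illustrative numbers, not from the demo data).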
        for fname in fnames:
            count = positive_feature_freqdist[fname].N()
            positive_feature_freqdist[fname][None] += num_positive_examples - count
            feature_values[fname].add(None)

        for fname in fnames:
            count = unlabeled_feature_freqdist[fname].N()
            unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count
            feature_values[fname].add(None)

        negative_prob_prior = 1.0 - positive_prob_prior

        # Create the P(label) distribution.
        label_probdist = DictionaryProbDist(
            {True: positive_prob_prior, False: negative_prob_prior}
        )

        # Create the P(fval|label, fname) distribution.
        feature_probdist = {}
        for fname, freqdist in positive_feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[True, fname] = probdist

        for fname, freqdist in unlabeled_feature_freqdist.items():
            global_probdist = estimator(freqdist, bins=len(feature_values[fname]))
            negative_feature_probs = {}
            for fval in feature_values[fname]:
                prob = (
                    global_probdist.prob(fval)
                    - positive_prob_prior * feature_probdist[True, fname].prob(fval)
                ) / negative_prob_prior
                # TODO: We need to add some kind of smoothing here, instead of
                # setting negative probabilities to zero and normalizing.
                negative_feature_probs[fval] = max(prob, 0.0)
            feature_probdist[False, fname] = DictionaryProbDist(
                negative_feature_probs, normalize=True
            )

        return PositiveNaiveBayesClassifier(label_probdist, feature_probdist)
##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////

def demo():
    from nltk.classify.util import partial_names_demo

    classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)
    classifier.show_most_informative_features()
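
# Run the demo when the module is executed as a script (a conventional,
# assumed entry point; not shown in the excerpt above):
if __name__ == "__main__":
    demo()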