# Natural Language Toolkit: Naive Bayes Classifiers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- A classifier based on the Naive Bayes algorithm. In order to find the
- probability for a label, this algorithm first uses the Bayes rule to
- express P(label|features) in terms of P(label) and P(features|label):
- | P(label) * P(features|label)
- | P(label|features) = ------------------------------
- | P(features)
- The algorithm then makes the 'naive' assumption that all features are
- independent, given the label:
- | P(label) * P(f1|label) * ... * P(fn|label)
- | P(label|features) = --------------------------------------------
- | P(features)
- Rather than computing P(features) explicitly, the algorithm just
- calculates the numerator for each label, and normalizes them so they
- sum to one:
- | P(label) * P(f1|label) * ... * P(fn|label)
- | P(label|features) = --------------------------------------------
- | SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
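
Example usage (a minimal sketch; the toy training data below is
invented for illustration, and the output assumes the default
``ELEProbDist`` estimator):

    >>> train_data = [({'a': 1, 'b': 1}, 'x'),
    ...               ({'a': 1, 'b': 0}, 'y')]
    >>> classifier = NaiveBayesClassifier.train(train_data)
    >>> classifier.classify({'a': 1, 'b': 1})
    'x'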
- """
from __future__ import print_function, unicode_literals

from collections import defaultdict

from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs
from nltk.classify.api import ClassifierI

##//////////////////////////////////////////////////////
##  Naive Bayes Classifier
##//////////////////////////////////////////////////////


class NaiveBayesClassifier(ClassifierI):
- """
- A Naive Bayes classifier. Naive Bayes classifiers are
- paramaterized by two probability distributions:
- - P(label) gives the probability that an input will receive each
- label, given no information about the input's features.
- - P(fname=fval|label) gives the probability that a given feature
- (fname) will receive a given value (fval), given that the
- label (label).
- If the classifier encounters an input with a feature that has
- never been seen with any label, then rather than assigning a
- probability of 0 to all labels, it will ignore that feature.
- The feature value 'None' is reserved for unseen feature values;
- you generally should not use 'None' as a feature value for one of
- your own features.
- """

    def __init__(self, label_probdist, feature_probdist):
        """
        :param label_probdist: P(label), the probability distribution
            over labels.  It is expressed as a ``ProbDistI`` whose
            samples are labels.  I.e., P(label) =
            ``label_probdist.prob(label)``.

        :param feature_probdist: P(fname=fval|label), the probability
            distribution for feature values, given labels.  It is
            expressed as a dictionary whose keys are ``(label, fname)``
            pairs and whose values are ``ProbDistI`` objects over feature
            values.  I.e., P(fname=fval|label) =
            ``feature_probdist[label,fname].prob(fval)``.  If a given
            ``(label,fname)`` is not a key in ``feature_probdist``, then
            it is assumed that the corresponding P(fname=fval|label)
            is 0 for all values of ``fval``.
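
        For example (a sketch; the labels, feature name, and
        probabilities here are invented), both distributions can be
        built by hand from ``DictionaryProbDist`` objects:

            >>> from nltk.probability import DictionaryProbDist
            >>> label_probdist = DictionaryProbDist({'spam': 0.6, 'ham': 0.4})
            >>> feature_probdist = {
            ...     ('spam', 'contains(win)'): DictionaryProbDist({True: 0.8, False: 0.2}),
            ...     ('ham', 'contains(win)'): DictionaryProbDist({True: 0.1, False: 0.9}),
            ... }
            >>> classifier = NaiveBayesClassifier(label_probdist, feature_probdist)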
- """
- self._label_probdist = label_probdist
- self._feature_probdist = feature_probdist
- self._labels = list(label_probdist.samples())

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
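        """
        :param featureset: A dictionary mapping feature names to
            feature values for the input to classify.
        :return: A ``DictionaryProbDist`` mapping each label to its
            normalized posterior probability, computed in log space
            to avoid numeric underflow.
        """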
        # Discard any feature names that we've never seen before.
        # Otherwise, we'll just assign a probability of 0 to
        # everything.
        featureset = featureset.copy()
        for fname in list(featureset.keys()):
            for label in self._labels:
                if (label, fname) in self._feature_probdist:
                    break
            else:
                # Unseen feature name: ignore it.
                del featureset[fname]

        # Find the log probability of each label, given the features.
        # Start with the log probability of the label itself.
        logprob = {}
        for label in self._labels:
            logprob[label] = self._label_probdist.logprob(label)

        # Then add in the log probability of features given labels.
        for label in self._labels:
            for (fname, fval) in featureset.items():
                if (label, fname) in self._feature_probdist:
                    feature_probs = self._feature_probdist[label, fname]
                    logprob[label] += feature_probs.logprob(fval)
                else:
                    # nb: This case will never come up if the
                    # classifier was created by
                    # NaiveBayesClassifier.train().
                    logprob[label] += sum_logs([])  # = -INF.

        return DictionaryProbDist(logprob, normalize=True, log=True)

    def show_most_informative_features(self, n=10):
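        """
        Print a table of the ``n`` most informative features, showing
        for each ``(fname, fval)`` pair the labels with the highest and
        lowest P(fname=fval|label) and the ratio between the two.
        """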
        # Determine the most relevant features, and display them.
        cpdist = self._feature_probdist
        print('Most Informative Features')

        for (fname, fval) in self.most_informative_features(n):

            def labelprob(l):
                return cpdist[l, fname].prob(fval)

            labels = sorted(
                [l for l in self._labels if fval in cpdist[l, fname].samples()],
                key=labelprob,
            )
            if len(labels) == 1:
                continue
            l0 = labels[0]
            l1 = labels[-1]
            if cpdist[l0, fname].prob(fval) == 0:
                ratio = 'INF'
            else:
                ratio = '%8.1f' % (
                    cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)
                )
            print(
                '%24s = %-14r %6s : %-6s = %s : 1.0'
                % (fname, fval, ('%s' % l1)[:6], ('%s' % l0)[:6], ratio)
            )

    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature ``(fname,fval)`` is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label:

        |  max[ P(fname=fval|label1) / P(fname=fval|label2) ]
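
        For example (with invented numbers), if P(contains(win)=True|spam)
        = 0.8 and P(contains(win)=True|ham) = 0.1, then the informativeness
        of ``('contains(win)', True)`` is 0.8 / 0.1 = 8.0.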
- """
- if hasattr(self, '_most_informative_features'):
- return self._most_informative_features[:n]
- else:
- # The set of (fname, fval) pairs used by this classifier.
- features = set()
- # The max & min probability associated w/ each (fname, fval)
- # pair. Maps (fname,fval) -> float.
- maxprob = defaultdict(lambda: 0.0)
- minprob = defaultdict(lambda: 1.0)
- for (label, fname), probdist in self._feature_probdist.items():
- for fval in probdist.samples():
- feature = (fname, fval)
- features.add(feature)
- p = probdist.prob(fval)
- maxprob[feature] = max(p, maxprob[feature])
- minprob[feature] = min(p, minprob[feature])
- if minprob[feature] == 0:
- features.discard(feature)
- # Convert features to a list, & sort it by how informative
- # features are.
- self._most_informative_features = sorted(
- features, key=lambda feature_: minprob[feature_] / maxprob[feature_]
- )
- return self._most_informative_features[:n]

    @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
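        :param estimator: A callable that builds a ``ProbDistI`` from a
            ``FreqDist`` (for the feature distributions it is also
            passed a ``bins`` keyword argument); ``ELEProbDist`` by
            default.
        :return: A ``NaiveBayesClassifier`` trained from the given
            featuresets.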
- """
- label_freqdist = FreqDist()
- feature_freqdist = defaultdict(FreqDist)
- feature_values = defaultdict(set)
- fnames = set()
- # Count up how many times each feature value occurred, given
- # the label and featurename.
- for featureset, label in labeled_featuresets:
- label_freqdist[label] += 1
- for fname, fval in featureset.items():
- # Increment freq(fval|label, fname)
- feature_freqdist[label, fname][fval] += 1
- # Record that fname can take the value fval.
- feature_values[fname].add(fval)
- # Keep a list of all feature names.
- fnames.add(fname)
- # If a feature didn't have a value given for an instance, then
- # we assume that it gets the implicit value 'None.' This loop
- # counts up the number of 'missing' feature values for each
- # (label,fname) pair, and increments the count of the fval
- # 'None' by that amount.
- for label in label_freqdist:
- num_samples = label_freqdist[label]
- for fname in fnames:
- count = feature_freqdist[label, fname].N()
- # Only add a None key when necessary, i.e. if there are
- # any samples with feature 'fname' missing.
- if num_samples - count > 0:
- feature_freqdist[label, fname][None] += num_samples - count
- feature_values[fname].add(None)
- # Create the P(label) distribution
- label_probdist = estimator(label_freqdist)
- # Create the P(fval|label, fname) distribution
- feature_probdist = {}
- for ((label, fname), freqdist) in feature_freqdist.items():
- probdist = estimator(freqdist, bins=len(feature_values[fname]))
- feature_probdist[label, fname] = probdist
- return cls(label_probdist, feature_probdist)


##//////////////////////////////////////////////////////
##  Demo
##//////////////////////////////////////////////////////


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(NaiveBayesClassifier.train)
    classifier.show_most_informative_features()


if __name__ == '__main__':
    demo()