# Natural Language Toolkit: Decision Tree Classifiers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- A classifier model that decides which label to assign to a token on
- the basis of a tree structure, where branches correspond to conditions
- on feature values, and leaves correspond to label assignments.
- """

from __future__ import print_function, unicode_literals, division

from collections import defaultdict

from nltk.probability import FreqDist, MLEProbDist, entropy
from nltk.classify.api import ClassifierI
from nltk.compat import python_2_unicode_compatible


@python_2_unicode_compatible
class DecisionTreeClassifier(ClassifierI):
    def __init__(self, label, feature_name=None, decisions=None, default=None):
        """
        :param label: The most likely label for tokens that reach
            this node in the decision tree.  If this decision tree
            has no children, then this label will be assigned to
            any token that reaches this decision tree.
        :param feature_name: The name of the feature that this
            decision tree selects for.
        :param decisions: A dictionary mapping from feature values
            for the feature identified by ``feature_name`` to
            child decision trees.
        :param default: The child that will be used if the value of
            feature ``feature_name`` does not match any of the keys in
            ``decisions``.  This is used when constructing binary
            decision trees.
        """
        self._label = label
        self._fname = feature_name
        self._decisions = decisions
        self._default = default
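
    # A hand-built stump, sketched for illustration (the 'outlook' feature
    # and its values are invented): tokens whose 'outlook' is 'sunny' reach
    # the 'yes' leaf; any other value falls through to the default child.
    #
    #     >>> leaf_yes = DecisionTreeClassifier('yes')
    #     >>> leaf_no = DecisionTreeClassifier('no')
    #     >>> tree = DecisionTreeClassifier(
    #     ...     'no', 'outlook', {'sunny': leaf_yes}, default=leaf_no)
    #     >>> tree.classify({'outlook': 'sunny'})
    #     'yes'
    #     >>> tree.classify({'outlook': 'rainy'})
    #     'no'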

    def labels(self):
        """
        Return the list of category labels used by this classifier.
        """
        labels = [self._label]
        if self._decisions is not None:
            for dt in self._decisions.values():
                labels.extend(dt.labels())
        if self._default is not None:
            labels.extend(self._default.labels())
        return list(set(labels))

    def classify(self, featureset):
        # Decision leaf:
        if self._fname is None:
            return self._label

        # Decision tree:
        fval = featureset.get(self._fname)
        if fval in self._decisions:
            return self._decisions[fval].classify(featureset)
        elif self._default is not None:
            return self._default.classify(featureset)
        else:
            return self._label

    def error(self, labeled_featuresets):
        """
        Return the fraction of ``labeled_featuresets`` that this
        classifier mislabels.
        """
        errors = 0
        for featureset, label in labeled_featuresets:
            if self.classify(featureset) != label:
                errors += 1
        return errors / len(labeled_featuresets)

    def pretty_format(self, width=70, prefix='', depth=4):
        """
        Return a string containing a pretty-printed version of this
        decision tree.  Each line in this string corresponds to a
        single decision tree node or leaf, and indentation is used to
        display the structure of the decision tree.
        """
        if self._fname is None:
            n = width - len(prefix) - 15
            return '{0}{1} {2}\n'.format(prefix, '.' * n, self._label)
        s = ''
        for fval, result in sorted(self._decisions.items()):
            hdr = '{0}{1}={2}? '.format(prefix, self._fname, fval)
            n = width - 15 - len(hdr)
            s += '{0}{1} {2}\n'.format(hdr, '.' * n, result._label)
            if result._fname is not None and depth > 1:
                s += result.pretty_format(width, prefix + '  ', depth - 1)
        if self._default is not None:
            n = width - len(prefix) - 21
            s += '{0}else: {1} {2}\n'.format(prefix, '.' * n, self._default._label)
            if self._default._fname is not None and depth > 1:
                s += self._default.pretty_format(width, prefix + '  ', depth - 1)
        return s

    def pseudocode(self, prefix='', depth=4):
        """
        Return a string representation of this decision tree that
        expresses the decisions it makes as a nested set of pseudocode
        if statements.
        """
        if self._fname is None:
            return '{0}return {1!r}\n'.format(prefix, self._label)
        s = ''
        for fval, result in sorted(self._decisions.items()):
            s += '{0}if {1} == {2!r}: '.format(prefix, self._fname, fval)
            if result._fname is not None and depth > 1:
                s += '\n' + result.pseudocode(prefix + '  ', depth - 1)
            else:
                s += 'return {0!r}\n'.format(result._label)
        if self._default is not None:
            if len(self._decisions) == 1:
                s += '{0}if {1} != {2!r}: '.format(
                    prefix, self._fname, list(self._decisions.keys())[0]
                )
            else:
                s += '{0}else: '.format(prefix)
            if self._default._fname is not None and depth > 1:
                s += '\n' + self._default.pseudocode(prefix + '  ', depth - 1)
            else:
                s += 'return {0!r}\n'.format(self._default._label)
        return s
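
    # For a stump like the hand-built example after ``__init__`` above,
    # the rendering would look like this (a sketch; feature name invented):
    #
    #     >>> print(tree.pseudocode(), end='')
    #     if outlook == 'sunny': return 'yes'
    #     if outlook != 'sunny': return 'no'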

    def __str__(self):
        return self.pretty_format()

    @staticmethod
    def train(
        labeled_featuresets,
        entropy_cutoff=0.05,
        depth_cutoff=100,
        support_cutoff=10,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        :param entropy_cutoff: A node is only refined if the entropy of
            its label distribution exceeds this value.
        :param depth_cutoff: The maximum depth of the resulting tree.
        :param support_cutoff: A node is only refined if it covers more
            than this many training instances.
        :param binary: If true, then treat all feature/value pairs as
            individual binary features, rather than using a single n-way
            branch for each feature.
        """
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, label in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose
            )
        else:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose
            )

        # Refine the stump.
        tree.refine(
            labeled_featuresets,
            entropy_cutoff,
            depth_cutoff - 1,
            support_cutoff,
            binary,
            feature_values,
            verbose,
        )

        # Return it.
        return tree
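
    # Training sketch on hypothetical toy data: with ``binary=True``, each
    # feature/value pair becomes its own yes/no split, which can help for
    # features that take many values.
    #
    #     >>> data = [({'last_letter': 'a'}, 'female'),
    #     ...         ({'last_letter': 'k'}, 'male'),
    #     ...         ({'last_letter': 'a'}, 'female')]
    #     >>> tree = DecisionTreeClassifier.train(data, binary=True)
    #     >>> tree.classify({'last_letter': 'a'})
    #     'female'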

    @staticmethod
    def leaf(labeled_featuresets):
        """
        Return a leaf node labeled with the most frequent label in
        ``labeled_featuresets``.
        """
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()
        return DecisionTreeClassifier(label)

    @staticmethod
    def stump(feature_name, labeled_featuresets):
        """
        Return a one-level tree that branches on every observed value of
        ``feature_name``, assigning each branch the most frequent label
        among the training instances with that value.
        """
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each value.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        for featureset, label in labeled_featuresets:
            feature_value = featureset.get(feature_name)
            freqs[feature_value][label] += 1

        decisions = dict(
            (val, DecisionTreeClassifier(freqs[val].max())) for val in freqs
        )
        return DecisionTreeClassifier(label, feature_name, decisions)

    def refine(
        self,
        labeled_featuresets,
        entropy_cutoff,
        depth_cutoff,
        support_cutoff,
        binary=False,
        feature_values=None,
        verbose=False,
    ):
        """
        Recursively replace any child whose label distribution is still
        too uncertain (entropy above ``entropy_cutoff``) with a subtree
        trained on the instances that reach it, subject to the depth and
        support cutoffs.
        """
        if len(labeled_featuresets) <= support_cutoff:
            return
        if self._fname is None:
            return
        if depth_cutoff <= 0:
            return
        for fval in self._decisions:
            fval_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) == fval
            ]
            label_freqs = FreqDist(label for (featureset, label) in fval_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._decisions[fval] = DecisionTreeClassifier.train(
                    fval_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )
        if self._default is not None:
            default_featuresets = [
                (featureset, label)
                for (featureset, label) in labeled_featuresets
                if featureset.get(self._fname) not in self._decisions
            ]
            label_freqs = FreqDist(label for (featureset, label) in default_featuresets)
            if entropy(MLEProbDist(label_freqs)) > entropy_cutoff:
                self._default = DecisionTreeClassifier.train(
                    default_featuresets,
                    entropy_cutoff,
                    depth_cutoff,
                    support_cutoff,
                    binary,
                    feature_values,
                    verbose,
                )

    @staticmethod
    def best_stump(feature_names, labeled_featuresets, verbose=False):
        """
        Return the single-feature stump (or plain leaf) with the lowest
        training error on ``labeled_featuresets``.
        """
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            stump = DecisionTreeClassifier.stump(fname, labeled_featuresets)
            stump_error = stump.error(labeled_featuresets)
            if stump_error < best_error:
                best_error = stump_error
                best_stump = stump
        if verbose:
            # ``_fname`` is None when no stump beat the leaf; substitute a
            # printable marker so the format spec does not choke on None.
            print(
                'best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
                    len(labeled_featuresets), best_stump._fname or '(leaf)', best_error
                )
            )
        return best_stump

    @staticmethod
    def binary_stump(feature_name, feature_value, labeled_featuresets):
        """
        Return a one-level tree that tests whether ``feature_name`` equals
        ``feature_value``, sending matching instances to one child and all
        other instances to the default child.
        """
        label = FreqDist(label for (featureset, label) in labeled_featuresets).max()

        # Find the best label for each side of the split.
        pos_fdist = FreqDist()
        neg_fdist = FreqDist()
        for featureset, label in labeled_featuresets:
            if featureset.get(feature_name) == feature_value:
                pos_fdist[label] += 1
            else:
                neg_fdist[label] += 1

        decisions = {}
        default = DecisionTreeClassifier(label)
        # But hopefully we have observations!
        if pos_fdist.N() > 0:
            decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())}
        if neg_fdist.N() > 0:
            default = DecisionTreeClassifier(neg_fdist.max())

        return DecisionTreeClassifier(label, feature_name, decisions, default)
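
    # Sketch of a binary stump on hypothetical toy data: 'red' instances go
    # to one child, everything else to the default child.
    #
    #     >>> data = [({'color': 'red'}, 'stop'),
    #     ...         ({'color': 'green'}, 'go'),
    #     ...         ({'color': 'amber'}, 'wait')]
    #     >>> stump = DecisionTreeClassifier.binary_stump('color', 'red', data)
    #     >>> stump.classify({'color': 'red'})
    #     'stop'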

    @staticmethod
    def best_binary_stump(
        feature_names, labeled_featuresets, feature_values, verbose=False
    ):
        """
        Return the binary stump (or plain leaf) with the lowest training
        error, considering every observed feature/value pair.
        """
        best_stump = DecisionTreeClassifier.leaf(labeled_featuresets)
        best_error = best_stump.error(labeled_featuresets)
        for fname in feature_names:
            for fval in feature_values[fname]:
                stump = DecisionTreeClassifier.binary_stump(
                    fname, fval, labeled_featuresets
                )
                stump_error = stump.error(labeled_featuresets)
                if stump_error < best_error:
                    best_error = stump_error
                    best_stump = stump
        if verbose:
            if best_stump._decisions:
                descr = '{0}={1}'.format(
                    best_stump._fname, list(best_stump._decisions.keys())[0]
                )
            else:
                descr = '(default)'
            print(
                'best stump for {:6d} toks uses {:20} err={:6.4f}'.format(
                    len(labeled_featuresets), descr, best_error
                )
            )
        return best_stump


##//////////////////////////////////////////////////////
## Demo
##//////////////////////////////////////////////////////


def f(x):
    return DecisionTreeClassifier.train(x, binary=True, verbose=True)


def demo():
    from nltk.classify.util import names_demo, binary_names_demo_features

    classifier = names_demo(
        f, binary_names_demo_features  # DecisionTreeClassifier.train,
    )
    print(classifier.pretty_format(depth=7))
    print(classifier.pseudocode(depth=7))


if __name__ == '__main__':
    demo()