123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345 |
- # Natural Language Toolkit: Classifier Utility Functions
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # Steven Bird <stevenbird1@gmail.com> (minor additions)
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Utility functions and classes for classifiers.
- """
- from __future__ import print_function, division
- import math
- # from nltk.util import Deprecated
- import nltk.classify.util # for accuracy & log_likelihood
- from nltk.util import LazyMap
- ######################################################################
- # { Helper Functions
- ######################################################################
- # alternative name possibility: 'map_featurefunc()'?
- # alternative name possibility: 'detect_features()'?
- # alternative name possibility: 'map_featuredetect()'?
- # or.. just have users use LazyMap directly?
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok,label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect: ``toks`` is treated as labeled when it is
        non-empty and its first element is a tuple or a list.)
    :return: A ``LazyMap`` over ``toks``.
    """
    if labeled is None:
        # Auto-detect from the first element; an empty ``toks`` is
        # treated as unlabeled (there is nothing to inspect).
        labeled = bool(toks) and isinstance(toks[0], (tuple, list))

    if not labeled:
        return LazyMap(feature_func, toks)

    def lazy_func(labeled_token):
        # Featurize the token part only; the label passes through as-is.
        return (feature_func(labeled_token[0]), labeled_token[1])

    return LazyMap(lazy_func, toks)
def attested_labels(tokens):
    """
    Collect the distinct labels that occur in a list of classified
    tokens.

    :return: A tuple containing every label attested in ``tokens``
        exactly once.  The labels pass through a set, so their order
        in the result is unspecified and they must be hashable.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
    """
    return tuple({label for (_tok, label) in tokens})
def log_likelihood(classifier, gold):
    """
    Return the natural log of the average probability that
    ``classifier`` assigns to the gold label of each example.

    Note that this is the log of the mean probability, not the mean
    of the per-example log probabilities.

    :param classifier: An object providing ``prob_classify_many()``,
        which must return one probability distribution (with a
        ``prob(label)`` method) per featureset.
    :param gold: A list of ``(featureset, label)`` pairs.
    :raise ZeroDivisionError: If ``gold`` is empty.
    :raise ValueError: If the average probability is zero (from
        ``math.log``).
    """
    featuresets = [featureset for (featureset, _label) in gold]
    pdists = classifier.prob_classify_many(featuresets)
    gold_probs = [
        pdist.prob(label) for ((_featureset, label), pdist) in zip(gold, pdists)
    ]
    mean_prob = sum(gold_probs) / len(gold_probs)
    return math.log(mean_prob)
def accuracy(classifier, gold):
    """
    Return the fraction of examples in ``gold`` whose label equals the
    label predicted by ``classifier``.

    :param classifier: An object providing ``classify_many()``, which
        must return one predicted label per featureset.
    :param gold: A list of ``(featureset, label)`` pairs.
    :return: The proportion of correct predictions, or ``0`` when
        there is nothing to score.
    """
    predictions = classifier.classify_many(
        [featureset for (featureset, _label) in gold]
    )
    hits = [
        label == guess for ((_featureset, label), guess) in zip(gold, predictions)
    ]
    if not hits:
        # Nothing was scored; avoid dividing by zero.
        return 0
    return sum(hits) / len(hits)
class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.

    Recognised keys of the ``cutoffs`` dict:

    - ``max_iter``: stop once the internal iteration counter reaches
      this value.
    - ``min_ll``: stop once the log likelihood is at least
      ``-abs(min_ll)``.
    - ``min_lldelta``: stop once the log likelihood improves by no
      more than ``abs(min_lldelta)`` between two checks.
    - ``max_acc``: stop once the accuracy is at least this value.
    - ``min_accdelta``: stop once the accuracy improves by no more
      than ``abs(min_accdelta)`` between two checks.
    """

    def __init__(self, cutoffs):
        """
        :param cutoffs: A dict of cutoff settings (see the class
            docstring).  It is copied; the caller's dict is left
            untouched.
        """
        # Normalise signs on our own copy, so that the values actually
        # consulted by check() are the normalised ones and the caller's
        # dict is not modified as a side effect.
        self.cutoffs = cutoffs.copy()
        if 'min_ll' in self.cutoffs:
            self.cutoffs['min_ll'] = -abs(self.cutoffs['min_ll'])
        if 'min_lldelta' in self.cutoffs:
            self.cutoffs['min_lldelta'] = abs(self.cutoffs['min_lldelta'])
        self.ll = None  # log likelihood seen at the previous check
        self.acc = None  # accuracy seen at the previous check
        self.iter = 1  # incremented at the start of every check()

    def check(self, classifier, train_toks):
        """
        Decide whether training should stop.

        :param classifier: The classifier in its current state.
        :param train_toks: The labeled training examples, passed on to
            ``nltk.classify.util.log_likelihood`` and
            ``nltk.classify.util.accuracy``.
        :return: True if any configured cutoff has been reached, or if
            the log likelihood is NaN; False otherwise.
        """
        cutoffs = self.cutoffs
        self.iter += 1
        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
            return True  # iteration cutoff.

        # Computed unconditionally so that a NaN likelihood always
        # stops training, whichever cutoffs are configured.
        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                return True  # log likelihood cutoff
            if (
                'min_lldelta' in cutoffs
                # Compare against None: a previous value of 0.0 is valid.
                and self.ll is not None
                and ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                return True  # accuracy cutoff
            if (
                'min_accdelta' in cutoffs
                # Compare against None: a previous accuracy of 0 is valid.
                and self.acc is not None
                and ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
- ######################################################################
- # { Demos
- ######################################################################
def names_demo_features(name):
    """
    Build the demo featureset for a name.

    The featureset holds an always-true ``alwayson`` feature, the
    lowercased first and last characters of ``name``, and, for each
    ASCII letter, how often it occurs in the lowercased name
    (``count(x)``) and whether it occurs at all (``has(x)``).

    :param name: A non-empty string.
    :return: A dict mapping feature names to feature values.
    """
    features = {
        'alwayson': True,
        'startswith': name[0].lower(),
        'endswith': name[-1].lower(),
    }
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
    return features
def binary_names_demo_features(name):
    """
    Build a demo featureset for a name that also encodes its first and
    last characters as per-letter boolean features.

    Besides ``alwayson``, the featureset records whether the
    lowercased first/last character is one of ``aeiouy``, and, for
    each ASCII letter, its count in the lowercased name, whether it
    occurs, and whether it equals the lowercased first/last character.

    :param name: A non-empty string.
    :return: A dict mapping feature names to feature values.
    """
    first = name[0].lower()
    last = name[-1].lower()
    features = {
        'alwayson': True,
        'startswith(vowel)': first in 'aeiouy',
        'endswith(vowel)': last in 'aeiouy',
    }
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
        features['startswith(%s)' % letter] = letter == first
        features['endswith(%s)' % letter] = letter == last
    return features
def names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a gender classifier on the NLTK names corpus.

    The male and female name lists are labeled, shuffled with a fixed
    seed, and split into 5000 training and 500 test items.  Accuracy
    is printed; if the classifier supports probabilistic
    classification, the average log likelihood and the distributions
    for the first five test names are printed as well.

    :param trainer: A callable that takes a list of
        ``(featureset, label)`` pairs and returns a classifier.
    :param features: A callable mapping a name to a featureset.
    :return: The trained classifier.
    """
    import random
    from nltk.corpus import names

    # Construct a list of classified names, using the names corpus.
    labeled_names = [(name, 'male') for name in names.words('male.txt')]
    labeled_names += [(name, 'female') for name in names.words('female.txt')]

    # Fixed seed, so the train/test split is reproducible.
    random.seed(123456)
    random.shuffle(labeled_names)
    train_set = labeled_names[:5000]
    heldout = labeled_names[5000:5500]

    print('Training classifier...')
    classifier = trainer([(features(name), gender) for (name, gender) in train_set])

    print('Testing classifier...')
    acc = accuracy(classifier, [(features(name), gender) for (name, gender) in heldout])
    print('Accuracy: %6.4f' % acc)

    # Classifiers without probabilistic output raise NotImplementedError;
    # for those, the likelihood report is simply skipped.
    try:
        heldout_featuresets = [features(name) for (name, _gender) in heldout]
        pdists = classifier.prob_classify_many(heldout_featuresets)
        logprobs = [
            pdist.logprob(gold) for ((_name, gold), pdist) in zip(heldout, pdists)
        ]
        print('Avg. log likelihood: %6.4f' % (sum(logprobs) / len(heldout)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        for ((name, gender), pdist) in list(zip(heldout, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            fmt = (
                ' %-15s *%6.4f %6.4f'
                if gender == 'male'
                else ' %-15s %6.4f *%6.4f'
            )
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    return classifier
def partial_names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a classifier from positive and unlabeled
    examples only, using the NLTK names corpus.

    Male names serve as the positive class.  The trainer receives the
    featuresets of 2000 male names as positive examples and those of
    500 male plus 500 female names as unlabeled examples.  It is then
    tested on 250 male (label ``True``) and 250 female (label
    ``False``) names.

    :param trainer: A callable taking ``(positive_featuresets,
        unlabeled_featuresets)`` and returning a classifier.
    :param features: A callable mapping a name to a featureset.
    :return: The trained classifier.
    """
    from nltk.corpus import names
    import random

    male_names = names.words('male.txt')
    female_names = names.words('female.txt')

    # Fixed seed, so the splits are reproducible.
    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples
    # for training.  Real lists (not one-shot ``map`` iterators) are
    # built so the trainer may traverse them more than once.
    positive = [features(name) for name in male_names[:2000]]

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = [
        features(name) for name in male_names[2000:2500] + female_names[:500]
    ]

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names P(Male) P(Female)\n' + '-' * 40)
        # zip() is not subscriptable on Python 3, so materialise it
        # before slicing off the first five rows.
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            # The asterisk marks the column of the gold label.
            if is_male:
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
# Cache of labeled senseval instances, keyed by word, filled by wsd_demo().
_inst_cache = {}


def wsd_demo(trainer, word, features, n=1000):
    """
    Train and evaluate a word sense classifier on the senseval
    instances of ``word``.

    Each instance is labeled with its first sense.  The instances are
    shuffled with a fixed seed; of the first ``n`` (capped at the
    number available), the first 80% are used for training and the
    remainder for testing.

    :param trainer: A callable that takes a list of
        ``(featureset, label)`` pairs and returns a classifier.
    :param word: The senseval item whose instances are loaded.
    :param features: A callable mapping an instance to a featureset.
    :param n: The maximum number of instances to use.
    :return: The trained classifier.
    """
    import random
    from nltk.corpus import senseval

    # Get the instances, reading the corpus only on the first request
    # for a given word.
    print('Reading data...')
    if word not in _inst_cache:
        _inst_cache[word] = [
            (inst, inst.senses[0]) for inst in senseval.instances(word)
        ]
    # Copy, so that shuffling below leaves the cached list untouched.
    instances = _inst_cache[word][:]
    n = min(n, len(instances))
    senses = list({label for (_inst, label) in instances})
    print(' Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    train_set = instances[: int(0.8 * n)]
    heldout = instances[int(0.8 * n) : n]

    print('Training classifier...')
    classifier = trainer([(features(inst), label) for (inst, label) in train_set])

    print('Testing classifier...')
    acc = accuracy(classifier, [(features(inst), label) for (inst, label) in heldout])
    print('Accuracy: %6.4f' % acc)

    # Classifiers without probabilistic output raise NotImplementedError;
    # for those, the likelihood report is simply skipped.
    try:
        heldout_featuresets = [features(inst) for (inst, _label) in heldout]
        pdists = classifier.prob_classify_many(heldout_featuresets)
        logprobs = [
            pdist.logprob(gold) for ((_inst, gold), pdist) in zip(heldout, pdists)
        ]
        print('Avg. log likelihood: %6.4f' % (sum(logprobs) / len(heldout)))
    except NotImplementedError:
        pass

    return classifier
def check_megam_config():
    """
    Check whether the MEGAM binary is configured, by testing that the
    name ``_megam_bin`` is defined.

    :raise NameError: If ``_megam_bin`` is not defined, with a message
        explaining how to configure the binary.
    """
    # NOTE(review): no assignment to ``_megam_bin`` is visible in this
    # module -- confirm where the name is expected to come from.
    try:
        _megam_bin
    except NameError:
        raise NameError(
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
|