
# Natural Language Toolkit: Classifier Utility Functions
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Utility functions and classes for classifiers.
"""
from __future__ import print_function, division

import math

# from nltk.util import Deprecated
import nltk.classify.util  # for accuracy & log_likelihood
from nltk.util import LazyMap

######################################################################
# { Helper Functions
######################################################################

# alternative name possibility: 'map_featurefunc()'?
# alternative name possibility: 'detect_features()'?
# alternative name possibility: 'map_featuredetect()'?
# or.. just have users use LazyMap directly?
def apply_features(feature_func, toks, labeled=None):
    """
    Use the ``LazyMap`` class to construct a lazy list-like
    object that is analogous to ``map(feature_func, toks)``.  In
    particular, if ``labeled=False``, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If ``labeled=True``, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    :param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a dict
        mapping feature names to feature values.
    :param toks: The list of tokens to which ``feature_func`` should be
        applied.  If ``labeled=False``, then the list elements will be
        passed directly to ``feature_func()``.  If ``labeled=True``,
        then the list elements should be tuples ``(tok, label)``, and
        ``tok`` will be passed to ``feature_func()``.
    :param labeled: If true, then ``toks`` contains labeled tokens --
        i.e., tuples of the form ``(tok, label)``.  (Default:
        auto-detect based on types.)
    """
    if labeled is None:
        labeled = toks and isinstance(toks[0], (tuple, list))
    if labeled:

        def lazy_func(labeled_token):
            return (feature_func(labeled_token[0]), labeled_token[1])

        return LazyMap(lazy_func, toks)
    else:
        return LazyMap(feature_func, toks)
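# A minimal usage sketch (the feature function and data below are
# hypothetical): because apply_features returns a LazyMap, featuresets are
# only computed when the returned object is indexed or iterated over.
#
#     >>> def gender_features(word):
#     ...     return {'last_letter': word[-1].lower()}
#     >>> toks = [('Alice', 'female'), ('Bob', 'male')]
#     >>> featuresets = apply_features(gender_features, toks)  # labeled auto-detected
#     >>> featuresets[0]
#     ({'last_letter': 'e'}, 'female')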


def attested_labels(tokens):
    """
    :return: A tuple of all labels that are attested in the given list
        of tokens.
    :rtype: tuple of (immutable)
    :param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form ``(token, label)``.
    :type tokens: list
    """
    return tuple(set(label for (tok, label) in tokens))
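# For example (sketch), attested_labels([({'f': 1}, 'spam'), ({'f': 0}, 'ham'),
# ({'f': 1}, 'spam')]) returns a 2-element tuple containing 'spam' and 'ham',
# in unspecified order, since the labels pass through a set.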


def log_likelihood(classifier, gold):
    # Average probability assigned to the gold label, reported in log space.
    results = classifier.prob_classify_many([fs for (fs, l) in gold])
    ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)]
    return math.log(sum(ll) / len(ll))


def accuracy(classifier, gold):
    # Fraction of (featureset, label) pairs in ``gold`` that the
    # classifier labels correctly.
    results = classifier.classify_many([fs for (fs, l) in gold])
    correct = [l == r for ((fs, l), r) in zip(gold, results)]
    if correct:
        return sum(correct) / len(correct)
    else:
        return 0
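# Usage sketch for the two evaluation helpers above, assuming ``classifier``
# is a trained NLTK classifier and ``test`` is a list of
# ``(featureset, label)`` pairs; log_likelihood additionally requires the
# classifier to implement ``prob_classify_many()``.
#
#     >>> print('Accuracy: %6.4f' % accuracy(classifier, test))
#     >>> print('Log likelihood: %6.4f' % log_likelihood(classifier, test))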


class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.
    """

    def __init__(self, cutoffs):
        self.cutoffs = cutoffs.copy()
        # Normalize the signs of the cutoff values on our own copy, so
        # that ``check()`` sees the normalized values and the caller's
        # dict is left untouched.
        if 'min_ll' in cutoffs:
            self.cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
        if 'min_lldelta' in cutoffs:
            self.cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
        self.ll = None
        self.acc = None
        self.iter = 1
    def check(self, classifier, train_toks):
        cutoffs = self.cutoffs
        self.iter += 1
        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
            return True  # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                return True  # log likelihood cutoff
            if (
                'min_lldelta' in cutoffs
                and self.ll
                and ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))
            ):
                return True  # log likelihood delta cutoff
            self.ll = new_ll

        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
            new_acc = nltk.classify.util.accuracy(classifier, train_toks)
            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                return True  # accuracy cutoff
            if (
                'min_accdelta' in cutoffs
                and self.acc
                and ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))
            ):
                return True  # accuracy delta cutoff
            self.acc = new_acc

        return False  # no cutoff reached.
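# Illustrative sketch of how an iterative trainer might drive CutoffChecker;
# ``improve_classifier`` is hypothetical and stands in for one pass of the
# trainer's update step.
#
#     cutoffchecker = CutoffChecker({'max_iter': 100, 'min_lldelta': 1e-4})
#     while True:
#         classifier = improve_classifier(classifier, train_toks)
#         if cutoffchecker.check(classifier, train_toks):
#             break  # some cutoff (iterations, ll, or accuracy) was reached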


######################################################################
# { Demos
######################################################################


def names_demo_features(name):
    features = {}
    features['alwayson'] = True
    features['startswith'] = name[0].lower()
    features['endswith'] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = name.lower().count(letter)
        features['has(%s)' % letter] = letter in name.lower()
    return features


def binary_names_demo_features(name):
    features = {}
    features['alwayson'] = True
    features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
    features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = name.lower().count(letter)
        features['has(%s)' % letter] = letter in name.lower()
        features['startswith(%s)' % letter] = letter == name[0].lower()
        features['endswith(%s)' % letter] = letter == name[-1].lower()
    return features
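# For instance (sketch), names_demo_features('Alice') returns a dict
# containing {'alwayson': True, 'startswith': 'a', 'endswith': 'e',
# 'count(a)': 1, 'has(a)': True, ...}, with one count(...)/has(...) pair per
# letter.  binary_names_demo_features keeps the count/has features but
# replaces the letter-valued startswith/endswith with boolean per-letter
# and vowel tests, for trainers that expect binary features.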


def names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus.
    namelist = [(name, 'male') for name in names.words('male.txt')] + [
        (name, 'female') for name in names.words('female.txt')
    ]

    # Randomly split the names into a test & train set.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(n), g) for (n, g) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), g) for (n, g) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, g) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-' * 40)
        for ((name, gender), pdist) in list(zip(test, pdists))[:5]:
            if gender == 'male':
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
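# Usage sketch: any function mapping a list of (featureset, label) pairs to
# a classifier works as the trainer, e.g. (assuming the names corpus has
# been downloaded):
#
#     >>> from nltk.classify import NaiveBayesClassifier
#     >>> classifier = names_demo(NaiveBayesClassifier.train)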


def partial_names_demo(trainer, features=names_demo_features):
    from nltk.corpus import names
    import random

    # Copy into plain lists so the corpus views can be shuffled in place.
    male_names = list(names.words('male.txt'))
    female_names = list(names.words('female.txt'))

    random.seed(654321)
    random.shuffle(male_names)
    random.shuffle(female_names)

    # Create a list of male names to be used as positive-labeled examples for training
    positive = map(features, male_names[:2000])

    # Create a list of male and female names to be used as unlabeled examples
    unlabeled = map(features, male_names[2000:2500] + female_names[:500])

    # Create a test set with correctly-labeled male and female names
    test = [(name, True) for name in male_names[2500:2750]] + [
        (name, False) for name in female_names[500:750]
    ]
    random.shuffle(test)

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer(positive, unlabeled)

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(n), m) for (n, m) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(n) for (n, m) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
        print()
        print('Unseen Names      P(Male)  P(Female)\n' + '-' * 40)
        for ((name, is_male), pdist) in list(zip(test, pdists))[:5]:
            if is_male:
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (name, pdist.prob(True), pdist.prob(False)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
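# Usage sketch: here the trainer must accept (positive_featuresets,
# unlabeled_featuresets), as PositiveNaiveBayesClassifier.train does:
#
#     >>> from nltk.classify import PositiveNaiveBayesClassifier
#     >>> classifier = partial_names_demo(PositiveNaiveBayesClassifier.train)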


_inst_cache = {}


def wsd_demo(trainer, word, features, n=1000):
    from nltk.corpus import senseval
    import random

    # Get the instances.
    print('Reading data...')
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list(set(l for (i, l) in instances))
    print('  Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    train = instances[: int(0.8 * n)]
    test = instances[int(0.8 * n) : n]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(i) for (i, l) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold) for ((inst, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
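# Usage sketch: ``wsd_features`` below is a hypothetical feature function;
# senseval instances expose ``context`` (roughly, a list of (word, tag)
# pairs) and ``position`` (the index of the target word).
#
#     >>> from nltk.classify import NaiveBayesClassifier
#     >>> def wsd_features(inst):
#     ...     return {'prev-word': inst.context[inst.position - 1][0]}
#     >>> classifier = wsd_demo(NaiveBayesClassifier.train, 'hard.pos',
#     ...                       features=wsd_features)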


def check_megam_config():
    """
    Checks whether the MEGAM binary is configured.
    """
    # ``_megam_bin`` lives in nltk.classify.megam (where it is set by
    # ``nltk.config_megam()``), so look it up there rather than in this
    # module's namespace.
    from nltk.classify import megam

    if megam._megam_bin is None:
        err_msg = (
            "Please configure your megam binary first, e.g.\n"
            ">>> nltk.config_megam('/usr/bin/local/megam')"
        )
        raise NameError(err_msg)
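# Usage sketch (the binary path is machine-specific):
#
#     >>> import nltk
#     >>> nltk.config_megam('/usr/local/bin/megam')
#     >>> check_megam_config()  # returns silently once megam is configured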