.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT

===========
Probability
===========

    >>> import nltk
    >>> from nltk.probability import *

FreqDist
--------

    >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
    >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd1 == nltk.FreqDist(text1)
    True
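
For reference, a few of the basic ``FreqDist`` accessors (sample counts,
total number of outcomes, and relative frequencies) on this data:

    >>> fd1['no']
    1
    >>> fd1.N()
    9
    >>> fd1.freq('no')
    0.1111111111111...
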
Note that items are sorted in order of decreasing frequency; two items of
the same frequency appear in indeterminate order.

    >>> import itertools
    >>> both = nltk.FreqDist(text1 + text2)
    >>> both_most_common = both.most_common()
    >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1]))))
    [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]

    >>> both == fd1 + nltk.FreqDist(text2)
    True
    >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
    True

    >>> fd2 = nltk.FreqDist(text2)
    >>> fd1.update(fd2)
    >>> fd1 == both
    True

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd1.update(text2)
    >>> fd1 == both
    True

    >>> fd1 = nltk.FreqDist(text1)
    >>> fd2 = nltk.FreqDist(fd1)
    >>> fd2 == fd1
    True

``nltk.FreqDist`` can be pickled:

    >>> import pickle
    >>> fd1 = nltk.FreqDist(text1)
    >>> pickled = pickle.dumps(fd1)
    >>> fd1 == pickle.loads(pickled)
    True

Mathematical operations:

    >>> FreqDist('abbb') + FreqDist('bcc')
    FreqDist({'b': 4, 'c': 2, 'a': 1})

    >>> FreqDist('abbbc') - FreqDist('bccd')
    FreqDist({'b': 2, 'a': 1})

    >>> FreqDist('abbb') | FreqDist('bcc')
    FreqDist({'b': 3, 'c': 2, 'a': 1})

    >>> FreqDist('abbb') & FreqDist('bcc')
    FreqDist({'b': 1})
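
Note that ``-`` keeps only samples whose resulting count is positive:
'c' occurs once in the first distribution but twice in the second, so it is
dropped rather than kept with a negative count:

    >>> (FreqDist('abbbc') - FreqDist('bccd'))['c']
    0
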
ConditionalFreqDist
-------------------

    >>> cfd1 = ConditionalFreqDist()
    >>> cfd1[1] = FreqDist('abbbb')
    >>> cfd1[2] = FreqDist('xxxxyy')
    >>> cfd1
    <ConditionalFreqDist with 2 conditions>

    >>> cfd2 = ConditionalFreqDist()
    >>> cfd2[1] = FreqDist('bbccc')
    >>> cfd2[2] = FreqDist('xxxyyyzz')
    >>> cfd2[3] = FreqDist('m')
    >>> cfd2
    <ConditionalFreqDist with 3 conditions>

    >>> r = cfd1 + cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))]

    >>> r = cfd1 - cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))]

    >>> r = cfd1 | cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))]

    >>> r = cfd1 & cfd2
    >>> [(i,r[i]) for i in r.conditions()]
    [(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))]
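
Each condition indexes an ordinary ``FreqDist``, and the distribution also
tracks the total number of recorded outcomes across all conditions; a quick
illustration on ``cfd1`` from above:

    >>> cfd1[1].freq('b')
    0.8
    >>> cfd1.N()
    11
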
Testing some HMM estimators
---------------------------

We extract a small part (500 sentences) of the Brown corpus:

    >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
    >>> print(len(corpus))
    500

We create an HMM trainer. Note that we need the tags and symbols
from the whole corpus, not just the training corpus:

    >>> from nltk.util import unique_list
    >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
    >>> print(len(tag_set))
    92
    >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
    >>> print(len(symbols))
    1464
    >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

We divide the corpus into 90% training and 10% testing:

    >>> train_corpus = []
    >>> test_corpus = []
    >>> for i in range(len(corpus)):
    ...     if i % 10:
    ...         train_corpus += [corpus[i]]
    ...     else:
    ...         test_corpus += [corpus[i]]
    >>> print(len(train_corpus))
    450
    >>> print(len(test_corpus))
    50

And now we can test the estimators:

    >>> def train_and_test(est):
    ...     hmm = trainer.train_supervised(train_corpus, estimator=est)
    ...     print('%.2f%%' % (100 * hmm.evaluate(test_corpus)))

Maximum Likelihood Estimation
-----------------------------

- This resulted in an initialization error before r7209.

    >>> mle = lambda fd, bins: MLEProbDist(fd)
    >>> train_and_test(mle)
    22.75%
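
For a concrete sense of the estimator itself: ``MLEProbDist`` assigns each
sample its relative frequency, and unseen samples get zero. A minimal sketch
on a toy distribution:

    >>> pd = MLEProbDist(FreqDist('aab'))
    >>> pd.prob('a')
    0.6666666666666...
    >>> pd.prob('c')
    0.0
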
Laplace (= Lidstone with gamma==1)

    >>> train_and_test(LaplaceProbDist)
    66.04%
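
Equivalently, Laplace smoothing adds one to every count, assigning
``(c + 1) / (N + bins)`` to a sample observed ``c`` times; a small sketch
with 3 outcomes over 3 bins:

    >>> pd = LaplaceProbDist(FreqDist('aab'), bins=3)
    >>> pd.prob('a')
    0.5
    >>> pd.prob('c')
    0.1666666666666...
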
Expected Likelihood Estimation (= Lidstone with gamma==0.5)

    >>> train_and_test(ELEProbDist)
    73.01%

Lidstone Estimation, for gamma==0.1, 0.5 and 1
(the latter two should be exactly equal to the ELE and Laplace estimates above)

    >>> def lidstone(gamma):
    ...     return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
    >>> train_and_test(lidstone(0.1))
    82.51%
    >>> train_and_test(lidstone(0.5))
    73.01%
    >>> train_and_test(lidstone(1.0))
    66.04%
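
To make the smoothing concrete: Lidstone estimation assigns
``(c + gamma) / (N + bins * gamma)`` to a sample observed ``c`` times.
With ``gamma=0.1`` and 3 bins:

    >>> pd = LidstoneProbDist(FreqDist('aab'), 0.1, bins=3)
    >>> pd.prob('a')
    0.6363636363636...
    >>> pd.prob('c')
    0.0303030303030...
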
Witten Bell Estimation
----------------------

- This resulted in a ZeroDivisionError before r7209.

    >>> train_and_test(WittenBellProbDist)
    88.12%
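
Witten-Bell reserves probability mass ``T / (N + T)`` for unseen events,
where ``T`` is the number of observed types, and splits it evenly among the
unseen bins; a small sketch with 2 seen types, 3 outcomes and 4 bins:

    >>> pd = WittenBellProbDist(FreqDist('aab'), bins=4)
    >>> pd.prob('a')
    0.4
    >>> pd.prob('z')
    0.2
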
Good Turing Estimation
----------------------

    >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5)
    >>> train_and_test(gt)
    86.93%

Kneser Ney Estimation
---------------------

Since the Kneser-Ney distribution is best suited for trigrams, we must adjust
our testing accordingly.

    >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1]))
    ...     for x, y, z in nltk.trigrams(sent)]
    ...     for sent in corpus[:100]]

We will then need to redefine the rest of the training/testing variables:

    >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
    >>> len(tag_set)
    906
    >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
    >>> len(symbols)
    1341

    >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
    >>> train_corpus = []
    >>> test_corpus = []
    >>> for i in range(len(corpus)):
    ...     if i % 10:
    ...         train_corpus += [corpus[i]]
    ...     else:
    ...         test_corpus += [corpus[i]]

    >>> len(train_corpus)
    90
    >>> len(test_corpus)
    10

    >>> kn = lambda fd, bins: KneserNeyProbDist(fd)
    >>> train_and_test(kn)
    0.86%
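
``KneserNeyProbDist`` expects a ``FreqDist`` of trigrams and applies an
absolute discount to each count, 0.75 by default; a minimal sketch on toy
trigrams:

    >>> fd = FreqDist(nltk.trigrams(['no', 'good', 'fish', 'goes', 'anywhere']))
    >>> KneserNeyProbDist(fd).discount()
    0.75
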
Remains to be added:

- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist

Squashed bugs
-------------

Issue 511: override pop and popitem to invalidate the cache

    >>> fd = nltk.FreqDist('a')
    >>> list(fd.keys())
    ['a']
    >>> fd.pop('a')
    1
    >>> list(fd.keys())
    []

Issue 533: access cumulative frequencies with no arguments

    >>> fd = nltk.FreqDist('aab')
    >>> list(fd._cumulative_frequencies(['a']))
    [2.0]
    >>> list(fd._cumulative_frequencies(['a', 'b']))
    [2.0, 3.0]

Issue 579: override clear to reset some variables

    >>> fd = FreqDist('aab')
    >>> fd.clear()
    >>> fd.N()
    0

Issue 351: fix the fileids method of CategorizedCorpusReader so that it does
not inadvertently add errant categories

    >>> from nltk.corpus import brown
    >>> brown.fileids('blah')
    Traceback (most recent call last):
    ...
    ValueError: Category blah not found
    >>> brown.categories()
    ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']

Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default;
otherwise any unseen events get a probability of zero, i.e., they don't get
smoothed

    >>> from nltk import SimpleGoodTuringProbDist, FreqDist
    >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
    >>> p = SimpleGoodTuringProbDist(fd)
    >>> p.prob('a')
    0.017649766667026317...
    >>> p.prob('o')
    0.08433050215340411...
    >>> p.prob('z')
    0.022727272727272728...
    >>> p.prob('foobar')
    0.022727272727272728...

``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and
``ConditionalFreqDist`` can be pickled:

    >>> import pickle
    >>> pd = MLEProbDist(fd)
    >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples())
    True
    >>> dpd = DictionaryConditionalProbDist({'x': pd})
    >>> unpickled = pickle.loads(pickle.dumps(dpd))
    >>> dpd['x'].prob('a')
    0.011363636...
    >>> dpd['x'].prob('a') == unpickled['x'].prob('a')
    True
    >>> cfd = nltk.probability.ConditionalFreqDist()
    >>> cfd['foo']['hello'] += 1
    >>> cfd['foo']['hello'] += 1
    >>> cfd['bar']['hello'] += 1
    >>> cfd2 = pickle.loads(pickle.dumps(cfd))
    >>> cfd2 == cfd
    True
    >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
    >>> cpd2 = pickle.loads(pickle.dumps(cpd))
    >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello')
    True