
.. Copyright (C) 2001-2019 NLTK Project
.. For license information, see LICENSE.TXT

=======
Metrics
=======

The `nltk.metrics` package provides a variety of *evaluation measures*
which can be used for a wide variety of NLP tasks.

>>> from __future__ import print_function
>>> from nltk.metrics import *

------------------
Standard IR Scores
------------------

We can use standard scores from information retrieval to test the
performance of taggers, chunkers, etc.

>>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
>>> print(accuracy(reference, test))
0.8
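
Accuracy here is just the fraction of positions at which the two
sequences agree, which we can confirm with plain Python:

>>> sum(1 for r, t in zip(reference, test) if r == t) / float(len(reference))
0.8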

The following measures apply to sets:

>>> reference_set = set(reference)
>>> test_set = set(test)
>>> precision(reference_set, test_set)
1.0
>>> print(recall(reference_set, test_set))
0.8
>>> print(f_measure(reference_set, test_set))
0.88888888888...
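
With the default weighting (alpha=0.5), `f_measure` is the harmonic
mean of precision and recall, so the value above can be reproduced by
hand from the precision of 1.0 and recall of 0.8:

>>> print(2 * 1.0 * 0.8 / (1.0 + 0.8))
0.888888888...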

Measuring the likelihood of the data, given probability distributions:

>>> from nltk import FreqDist, MLEProbDist
>>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
>>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
>>> print(log_likelihood(['a', 'd'], [pdist1, pdist2]))
-2.7075187496...
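
The score is the average log probability (base 2) of each test value
under its corresponding distribution. Assuming the MLE estimates from
the strings above ('a' occurs 3 times out of 16 characters in the
first, 'd' twice out of 16 in the second), the same figure can be
computed by hand:

>>> import math
>>> by_hand = (math.log(3.0 / 16, 2) + math.log(2.0 / 16, 2)) / 2
>>> abs(log_likelihood(['a', 'd'], [pdist1, pdist2]) - by_hand) < 1e-6
True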

----------------
Distance Metrics
----------------

String edit distance (Levenshtein):

>>> edit_distance("rain", "shine")
3
>>> edit_distance_align("shine", "shine")
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
>>> edit_distance_align("rain", "brainy")
[(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
>>> edit_distance_align("", "brainy")
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
>>> edit_distance_align("", "")
[(0, 0)]
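
`edit_distance` also accepts a `transpositions` flag; when it is set,
swapping two adjacent characters counts as a single edit
(Damerau-Levenshtein) rather than two:

>>> edit_distance("ab", "ba")
2
>>> edit_distance("ab", "ba", transpositions=True)
1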

Other distance measures:

>>> s1 = set([1,2,3,4])
>>> s2 = set([3,4,5])
>>> binary_distance(s1, s2)
1.0
>>> print(jaccard_distance(s1, s2))
0.6
>>> print(masi_distance(s1, s2))
0.868
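
Jaccard distance is one minus the ratio of the intersection size to the
union size, which we can confirm with plain set arithmetic:

>>> print(1 - len(s1 & s2) / float(len(s1 | s2)))
0.6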

----------------------
Miscellaneous Measures
----------------------

Rank correlation works with two dictionaries mapping keys to ranks.
The dictionaries should have the same set of keys.

>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3})
0.5
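
As expected for Spearman's rho, identical rankings over the same keys
score 1.0 and fully reversed rankings score -1.0:

>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 't':2, 'a':3})
1.0
>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':3, 't':2, 'a':1})
-1.0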

Windowdiff compares two segmentations of the same input (e.g.
tokenizations, chunkings) by sliding a fixed-size window over them and
counting the windows in which their boundary counts disagree.
Segmentations are represented as strings of zeros and ones.

>>> s1 = "000100000010"
>>> s2 = "000010000100"
>>> s3 = "100000010000"
>>> s4 = "000000000000"
>>> s5 = "111111111111"
>>> windowdiff(s1, s1, 3)
0.0
>>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6  # windowdiff(s1, s2, 3) == 0.3
True
>>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6  # windowdiff(s2, s3, 3) == 0.8
True
>>> windowdiff(s1, s4, 3)
0.5
>>> windowdiff(s1, s5, 3)
1.0
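
Since each window is scored by the absolute difference in boundary
counts between the two segmentations, windowdiff is symmetric in its
two arguments:

>>> windowdiff(s2, s1, 3) == windowdiff(s1, s2, 3)
True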

----------------
Confusion Matrix
----------------

>>> reference = 'This is the reference data.  Testing 123.  aoaeoeoe'
>>> test = 'Thos iz_the rifirenci data.  Testeng 123.  aoaeoeoe'
>>> print(ConfusionMatrix(reference, test))
  |   . 1 2 3 T _ a c d e f g h i n o r s t z |
--+-------------------------------------------+
  |<8>. . . . . 1 . . . . . . . . . . . . . . |
. | .<2>. . . . . . . . . . . . . . . . . . . |
1 | . .<1>. . . . . . . . . . . . . . . . . . |
2 | . . .<1>. . . . . . . . . . . . . . . . . |
3 | . . . .<1>. . . . . . . . . . . . . . . . |
T | . . . . .<2>. . . . . . . . . . . . . . . |
_ | . . . . . .<.>. . . . . . . . . . . . . . |
a | . . . . . . .<4>. . . . . . . . . . . . . |
c | . . . . . . . .<1>. . . . . . . . . . . . |
d | . . . . . . . . .<1>. . . . . . . . . . . |
e | . . . . . . . . . .<6>. . . 3 . . . . . . |
f | . . . . . . . . . . .<1>. . . . . . . . . |
g | . . . . . . . . . . . .<1>. . . . . . . . |
h | . . . . . . . . . . . . .<2>. . . . . . . |
i | . . . . . . . . . . 1 . . .<1>. 1 . . . . |
n | . . . . . . . . . . . . . . .<2>. . . . . |
o | . . . . . . . . . . . . . . . .<3>. . . . |
r | . . . . . . . . . . . . . . . . .<2>. . . |
s | . . . . . . . . . . . . . . . . . .<2>. 1 |
t | . . . . . . . . . . . . . . . . . . .<3>. |
z | . . . . . . . . . . . . . . . . . . . .<.>|
--+-------------------------------------------+
(row = reference; col = test)
<BLANKLINE>

>>> cm = ConfusionMatrix(reference, test)
>>> print(cm.pretty_format(sort_by_count=True))
  |   e a i o s t . T h n r 1 2 3 c d f g _ z |
--+-------------------------------------------+
  |<8>. . . . . . . . . . . . . . . . . . 1 . |
e | .<6>. 3 . . . . . . . . . . . . . . . . . |
a | . .<4>. . . . . . . . . . . . . . . . . . |
i | . 1 .<1>1 . . . . . . . . . . . . . . . . |
o | . . . .<3>. . . . . . . . . . . . . . . . |
s | . . . . .<2>. . . . . . . . . . . . . . 1 |
t | . . . . . .<3>. . . . . . . . . . . . . . |
. | . . . . . . .<2>. . . . . . . . . . . . . |
T | . . . . . . . .<2>. . . . . . . . . . . . |
h | . . . . . . . . .<2>. . . . . . . . . . . |
n | . . . . . . . . . .<2>. . . . . . . . . . |
r | . . . . . . . . . . .<2>. . . . . . . . . |
1 | . . . . . . . . . . . .<1>. . . . . . . . |
2 | . . . . . . . . . . . . .<1>. . . . . . . |
3 | . . . . . . . . . . . . . .<1>. . . . . . |
c | . . . . . . . . . . . . . . .<1>. . . . . |
d | . . . . . . . . . . . . . . . .<1>. . . . |
f | . . . . . . . . . . . . . . . . .<1>. . . |
g | . . . . . . . . . . . . . . . . . .<1>. . |
_ | . . . . . . . . . . . . . . . . . . .<.>. |
z | . . . . . . . . . . . . . . . . . . . .<.>|
--+-------------------------------------------+
(row = reference; col = test)
<BLANKLINE>
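
Individual cells of the matrix can also be read off directly; assuming
the tuple-indexing interface `cm[reference_label, test_label]`, the
number of times a reference 'e' was transcribed as 'i' is:

>>> cm['e', 'i']
3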

>>> print(cm.pretty_format(sort_by_count=True, truncate=10))
  |   e a i o s t . T h |
--+---------------------+
  |<8>. . . . . . . . . |
e | .<6>. 3 . . . . . . |
a | . .<4>. . . . . . . |
i | . 1 .<1>1 . . . . . |
o | . . . .<3>. . . . . |
s | . . . . .<2>. . . . |
t | . . . . . .<3>. . . |
. | . . . . . . .<2>. . |
T | . . . . . . . .<2>. |
h | . . . . . . . . .<2>|
--+---------------------+
(row = reference; col = test)
<BLANKLINE>

>>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False))
   |                   1 |
   | 1 2 3 4 5 6 7 8 9 0 |
---+---------------------+
 1 |<8>. . . . . . . . . |
 2 | .<6>. 3 . . . . . . |
 3 | . .<4>. . . . . . . |
 4 | . 1 .<1>1 . . . . . |
 5 | . . . .<3>. . . . . |
 6 | . . . . .<2>. . . . |
 7 | . . . . . .<3>. . . |
 8 | . . . . . . .<2>. . |
 9 | . . . . . . . .<2>. |
10 | . . . . . . . . .<2>|
---+---------------------+
(row = reference; col = test)
Value key:
 1:
 2: e
 3: a
 4: i
 5: o
 6: s
 7: t
 8: .
 9: T
10: h
<BLANKLINE>

--------------------
Association measures
--------------------

These measures are useful for determining whether the co-occurrence of
two random events is meaningful. They are used, for instance, to
distinguish collocations from other pairs of adjacent words.

We take some examples of bigram association calculations from Manning
and Schütze's SNLP, 2nd Ed., chapter 5.

>>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668
>>> bam = BigramAssocMeasures
>>> bam.raw_freq(20, (42, 20), N) == 20. / N
True
>>> bam.student_t(n_new_companies, (n_new, n_companies), N)
0.999...
>>> bam.chi_sq(n_new_companies, (n_new, n_companies), N)
1.54...
>>> bam.likelihood_ratio(150, (12593, 932), N)
1291...

For the other measures, we check the ordering they induce: a more
strongly associated bigram should score higher than a less strongly
associated one.

>>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N)
True
>>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N)
True
>>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N)
True
>>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N)
True
>>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N)
True
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
True
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
False
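
For bigrams, `pmi` follows the textbook definition, the base-2 log of
the ratio of the observed bigram probability to the product of the
unigram probabilities; assuming that reading, the score can be
reproduced by hand as log2(n_ii * N / (n_ix * n_xi)):

>>> import math
>>> abs(bam.pmi(20, (42, 20), N) - math.log(20. * N / (42. * 20.), 2)) < 1e-6
True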

For trigrams, we have to provide more count information: the trigram
count, the count of each pair of words within it, and the unigram
counts.

>>> n_w1_w2_w3 = 20
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
>>> n_w1, n_w2, n_w3 = 100, 200, 300
>>> uni_counts = (n_w1, n_w2, n_w3)
>>> N = 14307668
>>> tam = TrigramAssocMeasures
>>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N
True

Lowering one of the unigram counts (`uni_counts2`) makes the same
trigram count more surprising, so each measure should assign it a
higher score:

>>> uni_counts2 = (n_w1, n_w2, 100)
>>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N)
True
>>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
True
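
The trigram `pmi` appears to generalize the bigram formula with an
extra factor of N in the numerator, i.e.
log2(n_w1_w2_w3 * N**2 / (n_w1 * n_w2 * n_w3)); this is our assumption
about the generalization rather than a formula taken from the book, so
we check it numerically:

>>> import math
>>> by_hand = math.log(n_w1_w2_w3 * float(N) ** 2 / (n_w1 * n_w2 * n_w3), 2)
>>> abs(tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N) - by_hand) < 1e-6
True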

For fourgrams, we have to provide still more count information: the
fourgram count, the pair and triplet counts within it, and the unigram
counts.

>>> n_w1_w2_w3_w4 = 5
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
>>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
>>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
>>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
>>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
>>> N = 14307668
>>> qam = QuadgramAssocMeasures
>>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
True