association.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. # Natural Language Toolkit: Ngram Association Measures
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Joel Nothman <jnothman@student.usyd.edu.au>
  5. # URL: <http://nltk.org>
  6. # For license information, see LICENSE.TXT
  7. """
  8. Provides scoring functions for a number of association measures through a
  9. generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
  10. ``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
  11. """
  12. from __future__ import division
  13. import math as _math
  14. from abc import ABCMeta, abstractmethod
  15. from functools import reduce
  16. from six import add_metaclass
  17. _log2 = lambda x: _math.log(x, 2.0)
  18. _ln = _math.log
  19. _product = lambda s: reduce(lambda x, y: x * y, s)
  20. _SMALL = 1e-20
  21. try:
  22. from scipy.stats import fisher_exact
  23. except ImportError:
  24. def fisher_exact(*_args, **_kwargs):
  25. raise NotImplementedError
  26. ### Indices to marginals arguments:
  27. NGRAM = 0
  28. """Marginals index for the ngram count"""
  29. UNIGRAMS = -2
  30. """Marginals index for a tuple of each unigram count"""
  31. TOTAL = -1
  32. """Marginals index for the number of words in the data"""
  33. @add_metaclass(ABCMeta)
  34. class NgramAssocMeasures(object):
  35. """
  36. An abstract class defining a collection of generic association measures.
  37. Each public method returns a score, taking the following arguments::
  38. score_fn(count_of_ngram,
  39. (count_of_n-1gram_1, ..., count_of_n-1gram_j),
  40. (count_of_n-2gram_1, ..., count_of_n-2gram_k),
  41. ...,
  42. (count_of_1gram_1, ..., count_of_1gram_n),
  43. count_of_total_words)
  44. See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
  45. Inheriting classes should define a property _n, and a method _contingency
  46. which calculates contingency values from marginals in order for all
  47. association measures defined here to be usable.
  48. """
  49. _n = 0
  50. @staticmethod
  51. @abstractmethod
  52. def _contingency(*marginals):
  53. """Calculates values of a contingency table from marginal values."""
  54. raise NotImplementedError(
  55. "The contingency table is not available" "in the general ngram case"
  56. )
  57. @staticmethod
  58. @abstractmethod
  59. def _marginals(*contingency):
  60. """Calculates values of contingency table marginals from its values."""
  61. raise NotImplementedError(
  62. "The contingency table is not available" "in the general ngram case"
  63. )
  64. @classmethod
  65. def _expected_values(cls, cont):
  66. """Calculates expected values for a contingency table."""
  67. n_all = sum(cont)
  68. bits = [1 << i for i in range(cls._n)]
  69. # For each contingency table cell
  70. for i in range(len(cont)):
  71. # Yield the expected value
  72. yield (
  73. _product(
  74. sum(cont[x] for x in range(2 ** cls._n) if (x & j) == (i & j))
  75. for j in bits
  76. )
  77. / (n_all ** (cls._n - 1))
  78. )
  79. @staticmethod
  80. def raw_freq(*marginals):
  81. """Scores ngrams by their frequency"""
  82. return marginals[NGRAM] / marginals[TOTAL]
  83. @classmethod
  84. def student_t(cls, *marginals):
  85. """Scores ngrams using Student's t test with independence hypothesis
  86. for unigrams, as in Manning and Schutze 5.3.1.
  87. """
  88. return (
  89. marginals[NGRAM]
  90. - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
  91. ) / (marginals[NGRAM] + _SMALL) ** 0.5
  92. @classmethod
  93. def chi_sq(cls, *marginals):
  94. """Scores ngrams using Pearson's chi-square as in Manning and Schutze
  95. 5.3.3.
  96. """
  97. cont = cls._contingency(*marginals)
  98. exps = cls._expected_values(cont)
  99. return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
  100. @staticmethod
  101. def mi_like(*marginals, **kwargs):
  102. """Scores ngrams using a variant of mutual information. The keyword
  103. argument power sets an exponent (default 3) for the numerator. No
  104. logarithm of the result is calculated.
  105. """
  106. return marginals[NGRAM] ** kwargs.get('power', 3) / _product(
  107. marginals[UNIGRAMS]
  108. )
  109. @classmethod
  110. def pmi(cls, *marginals):
  111. """Scores ngrams by pointwise mutual information, as in Manning and
  112. Schutze 5.4.
  113. """
  114. return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
  115. _product(marginals[UNIGRAMS])
  116. )
  117. @classmethod
  118. def likelihood_ratio(cls, *marginals):
  119. """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
  120. """
  121. cont = cls._contingency(*marginals)
  122. return cls._n * sum(
  123. obs * _ln(obs / (exp + _SMALL) + _SMALL)
  124. for obs, exp in zip(cont, cls._expected_values(cont))
  125. )
  126. @classmethod
  127. def poisson_stirling(cls, *marginals):
  128. """Scores ngrams using the Poisson-Stirling measure."""
  129. exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
  130. return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
  131. @classmethod
  132. def jaccard(cls, *marginals):
  133. """Scores ngrams using the Jaccard index."""
  134. cont = cls._contingency(*marginals)
  135. return cont[0] / sum(cont[:-1])
  136. class BigramAssocMeasures(NgramAssocMeasures):
  137. """
  138. A collection of bigram association measures. Each association measure
  139. is provided as a function with three arguments::
  140. bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
  141. The arguments constitute the marginals of a contingency table, counting
  142. the occurrences of particular events in a corpus. The letter i in the
  143. suffix refers to the appearance of the word in question, while x indicates
  144. the appearance of any word. Thus, for example:
  145. n_ii counts (w1, w2), i.e. the bigram being scored
  146. n_ix counts (w1, *)
  147. n_xi counts (*, w2)
  148. n_xx counts (*, *), i.e. any bigram
  149. This may be shown with respect to a contingency table::
  150. w1 ~w1
  151. ------ ------
  152. w2 | n_ii | n_oi | = n_xi
  153. ------ ------
  154. ~w2 | n_io | n_oo |
  155. ------ ------
  156. = n_ix TOTAL = n_xx
  157. """
  158. _n = 2
  159. @staticmethod
  160. def _contingency(n_ii, n_ix_xi_tuple, n_xx):
  161. """Calculates values of a bigram contingency table from marginal values."""
  162. (n_ix, n_xi) = n_ix_xi_tuple
  163. n_oi = n_xi - n_ii
  164. n_io = n_ix - n_ii
  165. return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
  166. @staticmethod
  167. def _marginals(n_ii, n_oi, n_io, n_oo):
  168. """Calculates values of contingency table marginals from its values."""
  169. return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
  170. @staticmethod
  171. def _expected_values(cont):
  172. """Calculates expected values for a contingency table."""
  173. n_xx = sum(cont)
  174. # For each contingency table cell
  175. for i in range(4):
  176. yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
  177. @classmethod
  178. def phi_sq(cls, *marginals):
  179. """Scores bigrams using phi-square, the square of the Pearson correlation
  180. coefficient.
  181. """
  182. n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
  183. return (n_ii * n_oo - n_io * n_oi) ** 2 / (
  184. (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
  185. )
  186. @classmethod
  187. def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
  188. """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
  189. of bigrams, as in Manning and Schutze 5.3.3.
  190. """
  191. (n_ix, n_xi) = n_ix_xi_tuple
  192. return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
  193. @classmethod
  194. def fisher(cls, *marginals):
  195. """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
  196. sensitive to small counts than PMI or Chi Sq, but also more expensive
  197. to compute. Requires scipy.
  198. """
  199. n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
  200. (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
  201. return pvalue
  202. @staticmethod
  203. def dice(n_ii, n_ix_xi_tuple, n_xx):
  204. """Scores bigrams using Dice's coefficient."""
  205. (n_ix, n_xi) = n_ix_xi_tuple
  206. return 2 * n_ii / (n_ix + n_xi)
  207. class TrigramAssocMeasures(NgramAssocMeasures):
  208. """
  209. A collection of trigram association measures. Each association measure
  210. is provided as a function with four arguments::
  211. trigram_score_fn(n_iii,
  212. (n_iix, n_ixi, n_xii),
  213. (n_ixx, n_xix, n_xxi),
  214. n_xxx)
  215. The arguments constitute the marginals of a contingency table, counting
  216. the occurrences of particular events in a corpus. The letter i in the
  217. suffix refers to the appearance of the word in question, while x indicates
  218. the appearance of any word. Thus, for example:
  219. n_iii counts (w1, w2, w3), i.e. the trigram being scored
  220. n_ixx counts (w1, *, *)
  221. n_xxx counts (*, *, *), i.e. any trigram
  222. """
  223. _n = 3
  224. @staticmethod
  225. def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
  226. """Calculates values of a trigram contingency table (or cube) from
  227. marginal values.
  228. >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
  229. (1, 0, 0, 0, 0, 72, 0, 1927)
  230. """
  231. (n_iix, n_ixi, n_xii) = n_iix_tuple
  232. (n_ixx, n_xix, n_xxi) = n_ixx_tuple
  233. n_oii = n_xii - n_iii
  234. n_ioi = n_ixi - n_iii
  235. n_iio = n_iix - n_iii
  236. n_ooi = n_xxi - n_iii - n_oii - n_ioi
  237. n_oio = n_xix - n_iii - n_oii - n_iio
  238. n_ioo = n_ixx - n_iii - n_ioi - n_iio
  239. n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
  240. return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
  241. @staticmethod
  242. def _marginals(*contingency):
  243. """Calculates values of contingency table marginals from its values.
  244. >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
  245. (1, (1, 1, 1), (1, 73, 1), 2000)
  246. """
  247. n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
  248. return (
  249. n_iii,
  250. (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
  251. (
  252. n_iii + n_ioi + n_iio + n_ioo,
  253. n_iii + n_oii + n_iio + n_oio,
  254. n_iii + n_oii + n_ioi + n_ooi,
  255. ),
  256. sum(contingency),
  257. )
  258. class QuadgramAssocMeasures(NgramAssocMeasures):
  259. """
  260. A collection of quadgram association measures. Each association measure
  261. is provided as a function with five arguments::
  262. trigram_score_fn(n_iiii,
  263. (n_iiix, n_iixi, n_ixii, n_xiii),
  264. (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
  265. (n_ixxx, n_xixx, n_xxix, n_xxxi),
  266. n_all)
  267. The arguments constitute the marginals of a contingency table, counting
  268. the occurrences of particular events in a corpus. The letter i in the
  269. suffix refers to the appearance of the word in question, while x indicates
  270. the appearance of any word. Thus, for example:
  271. n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
  272. n_ixxi counts (w1, *, *, w4)
  273. n_xxxx counts (*, *, *, *), i.e. any quadgram
  274. """
  275. _n = 4
  276. @staticmethod
  277. def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
  278. """Calculates values of a quadgram contingency table from
  279. marginal values.
  280. """
  281. (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
  282. (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
  283. (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
  284. n_oiii = n_xiii - n_iiii
  285. n_ioii = n_ixii - n_iiii
  286. n_iioi = n_iixi - n_iiii
  287. n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
  288. n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
  289. n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
  290. n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
  291. n_iiio = n_iiix - n_iiii
  292. n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
  293. n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
  294. n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
  295. n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
  296. n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
  297. n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
  298. n_oooo = (
  299. n_xxxx
  300. - n_iiii
  301. - n_oiii
  302. - n_ioii
  303. - n_iioi
  304. - n_ooii
  305. - n_oioi
  306. - n_iooi
  307. - n_oooi
  308. - n_iiio
  309. - n_oiio
  310. - n_ioio
  311. - n_ooio
  312. - n_iioo
  313. - n_oioo
  314. - n_iooo
  315. )
  316. return (
  317. n_iiii,
  318. n_oiii,
  319. n_ioii,
  320. n_ooii,
  321. n_iioi,
  322. n_oioi,
  323. n_iooi,
  324. n_oooi,
  325. n_iiio,
  326. n_oiio,
  327. n_ioio,
  328. n_ooio,
  329. n_iioo,
  330. n_oioo,
  331. n_iooo,
  332. n_oooo,
  333. )
  334. @staticmethod
  335. def _marginals(*contingency):
  336. """Calculates values of contingency table marginals from its values.
  337. QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
  338. (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
  339. """
  340. n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo = (
  341. contingency
  342. )
  343. n_iiix = n_iiii + n_iiio
  344. n_iixi = n_iiii + n_iioi
  345. n_ixii = n_iiii + n_ioii
  346. n_xiii = n_iiii + n_oiii
  347. n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
  348. n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
  349. n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
  350. n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
  351. n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
  352. n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
  353. n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
  354. n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
  355. n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
  356. n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
  357. n_all = sum(contingency)
  358. return (
  359. n_iiii,
  360. (n_iiix, n_iixi, n_ixii, n_xiii),
  361. (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
  362. (n_ixxx, n_xixx, n_xxix, n_xxxi),
  363. n_all,
  364. )
  365. class ContingencyMeasures(object):
  366. """Wraps NgramAssocMeasures classes such that the arguments of association
  367. measures are contingency table values rather than marginals.
  368. """
  369. def __init__(self, measures):
  370. """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
  371. self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
  372. for k in dir(measures):
  373. if k.startswith('__'):
  374. continue
  375. v = getattr(measures, k)
  376. if not k.startswith('_'):
  377. v = self._make_contingency_fn(measures, v)
  378. setattr(self, k, v)
  379. @staticmethod
  380. def _make_contingency_fn(measures, old_fn):
  381. """From an association measure function, produces a new function which
  382. accepts contingency table values as its arguments.
  383. """
  384. def res(*contingency):
  385. return old_fn(*measures._marginals(*contingency))
  386. res.__doc__ = old_fn.__doc__
  387. res.__name__ = old_fn.__name__
  388. return res