123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466 |
- # Natural Language Toolkit: Ngram Association Measures
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Joel Nothman <jnothman@student.usyd.edu.au>
- # URL: <http://nltk.org>
- # For license information, see LICENSE.TXT
- """
- Provides scoring functions for a number of association measures through a
- generic, abstract implementation in ``NgramAssocMeasures``, and n-specific
- ``BigramAssocMeasures`` and ``TrigramAssocMeasures``.
- """
- from __future__ import division
- import math as _math
- from abc import ABCMeta, abstractmethod
- from functools import reduce
- from six import add_metaclass
- _log2 = lambda x: _math.log(x, 2.0)
- _ln = _math.log
- _product = lambda s: reduce(lambda x, y: x * y, s)
- _SMALL = 1e-20
- try:
- from scipy.stats import fisher_exact
- except ImportError:
- def fisher_exact(*_args, **_kwargs):
- raise NotImplementedError
- ### Indices to marginals arguments:
- NGRAM = 0
- """Marginals index for the ngram count"""
- UNIGRAMS = -2
- """Marginals index for a tuple of each unigram count"""
- TOTAL = -1
- """Marginals index for the number of words in the data"""
- @add_metaclass(ABCMeta)
- class NgramAssocMeasures(object):
- """
- An abstract class defining a collection of generic association measures.
- Each public method returns a score, taking the following arguments::
- score_fn(count_of_ngram,
- (count_of_n-1gram_1, ..., count_of_n-1gram_j),
- (count_of_n-2gram_1, ..., count_of_n-2gram_k),
- ...,
- (count_of_1gram_1, ..., count_of_1gram_n),
- count_of_total_words)
- See ``BigramAssocMeasures`` and ``TrigramAssocMeasures``
- Inheriting classes should define a property _n, and a method _contingency
- which calculates contingency values from marginals in order for all
- association measures defined here to be usable.
- """
- _n = 0
- @staticmethod
- @abstractmethod
- def _contingency(*marginals):
- """Calculates values of a contingency table from marginal values."""
- raise NotImplementedError(
- "The contingency table is not available" "in the general ngram case"
- )
- @staticmethod
- @abstractmethod
- def _marginals(*contingency):
- """Calculates values of contingency table marginals from its values."""
- raise NotImplementedError(
- "The contingency table is not available" "in the general ngram case"
- )
- @classmethod
- def _expected_values(cls, cont):
- """Calculates expected values for a contingency table."""
- n_all = sum(cont)
- bits = [1 << i for i in range(cls._n)]
- # For each contingency table cell
- for i in range(len(cont)):
- # Yield the expected value
- yield (
- _product(
- sum(cont[x] for x in range(2 ** cls._n) if (x & j) == (i & j))
- for j in bits
- )
- / (n_all ** (cls._n - 1))
- )
- @staticmethod
- def raw_freq(*marginals):
- """Scores ngrams by their frequency"""
- return marginals[NGRAM] / marginals[TOTAL]
- @classmethod
- def student_t(cls, *marginals):
- """Scores ngrams using Student's t test with independence hypothesis
- for unigrams, as in Manning and Schutze 5.3.1.
- """
- return (
- marginals[NGRAM]
- - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
- ) / (marginals[NGRAM] + _SMALL) ** 0.5
- @classmethod
- def chi_sq(cls, *marginals):
- """Scores ngrams using Pearson's chi-square as in Manning and Schutze
- 5.3.3.
- """
- cont = cls._contingency(*marginals)
- exps = cls._expected_values(cont)
- return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
- @staticmethod
- def mi_like(*marginals, **kwargs):
- """Scores ngrams using a variant of mutual information. The keyword
- argument power sets an exponent (default 3) for the numerator. No
- logarithm of the result is calculated.
- """
- return marginals[NGRAM] ** kwargs.get('power', 3) / _product(
- marginals[UNIGRAMS]
- )
- @classmethod
- def pmi(cls, *marginals):
- """Scores ngrams by pointwise mutual information, as in Manning and
- Schutze 5.4.
- """
- return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
- _product(marginals[UNIGRAMS])
- )
- @classmethod
- def likelihood_ratio(cls, *marginals):
- """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.
- """
- cont = cls._contingency(*marginals)
- return cls._n * sum(
- obs * _ln(obs / (exp + _SMALL) + _SMALL)
- for obs, exp in zip(cont, cls._expected_values(cont))
- )
- @classmethod
- def poisson_stirling(cls, *marginals):
- """Scores ngrams using the Poisson-Stirling measure."""
- exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
- return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
- @classmethod
- def jaccard(cls, *marginals):
- """Scores ngrams using the Jaccard index."""
- cont = cls._contingency(*marginals)
- return cont[0] / sum(cont[:-1])
- class BigramAssocMeasures(NgramAssocMeasures):
- """
- A collection of bigram association measures. Each association measure
- is provided as a function with three arguments::
- bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
- The arguments constitute the marginals of a contingency table, counting
- the occurrences of particular events in a corpus. The letter i in the
- suffix refers to the appearance of the word in question, while x indicates
- the appearance of any word. Thus, for example:
- n_ii counts (w1, w2), i.e. the bigram being scored
- n_ix counts (w1, *)
- n_xi counts (*, w2)
- n_xx counts (*, *), i.e. any bigram
- This may be shown with respect to a contingency table::
- w1 ~w1
- ------ ------
- w2 | n_ii | n_oi | = n_xi
- ------ ------
- ~w2 | n_io | n_oo |
- ------ ------
- = n_ix TOTAL = n_xx
- """
- _n = 2
- @staticmethod
- def _contingency(n_ii, n_ix_xi_tuple, n_xx):
- """Calculates values of a bigram contingency table from marginal values."""
- (n_ix, n_xi) = n_ix_xi_tuple
- n_oi = n_xi - n_ii
- n_io = n_ix - n_ii
- return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
- @staticmethod
- def _marginals(n_ii, n_oi, n_io, n_oo):
- """Calculates values of contingency table marginals from its values."""
- return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
- @staticmethod
- def _expected_values(cont):
- """Calculates expected values for a contingency table."""
- n_xx = sum(cont)
- # For each contingency table cell
- for i in range(4):
- yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
- @classmethod
- def phi_sq(cls, *marginals):
- """Scores bigrams using phi-square, the square of the Pearson correlation
- coefficient.
- """
- n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
- return (n_ii * n_oo - n_io * n_oi) ** 2 / (
- (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
- )
- @classmethod
- def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
- """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
- of bigrams, as in Manning and Schutze 5.3.3.
- """
- (n_ix, n_xi) = n_ix_xi_tuple
- return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
- @classmethod
- def fisher(cls, *marginals):
- """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
- sensitive to small counts than PMI or Chi Sq, but also more expensive
- to compute. Requires scipy.
- """
- n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
- (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less')
- return pvalue
- @staticmethod
- def dice(n_ii, n_ix_xi_tuple, n_xx):
- """Scores bigrams using Dice's coefficient."""
- (n_ix, n_xi) = n_ix_xi_tuple
- return 2 * n_ii / (n_ix + n_xi)
- class TrigramAssocMeasures(NgramAssocMeasures):
- """
- A collection of trigram association measures. Each association measure
- is provided as a function with four arguments::
- trigram_score_fn(n_iii,
- (n_iix, n_ixi, n_xii),
- (n_ixx, n_xix, n_xxi),
- n_xxx)
- The arguments constitute the marginals of a contingency table, counting
- the occurrences of particular events in a corpus. The letter i in the
- suffix refers to the appearance of the word in question, while x indicates
- the appearance of any word. Thus, for example:
- n_iii counts (w1, w2, w3), i.e. the trigram being scored
- n_ixx counts (w1, *, *)
- n_xxx counts (*, *, *), i.e. any trigram
- """
- _n = 3
- @staticmethod
- def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
- """Calculates values of a trigram contingency table (or cube) from
- marginal values.
- >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000)
- (1, 0, 0, 0, 0, 72, 0, 1927)
- """
- (n_iix, n_ixi, n_xii) = n_iix_tuple
- (n_ixx, n_xix, n_xxi) = n_ixx_tuple
- n_oii = n_xii - n_iii
- n_ioi = n_ixi - n_iii
- n_iio = n_iix - n_iii
- n_ooi = n_xxi - n_iii - n_oii - n_ioi
- n_oio = n_xix - n_iii - n_oii - n_iio
- n_ioo = n_ixx - n_iii - n_ioi - n_iio
- n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo
- return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo)
- @staticmethod
- def _marginals(*contingency):
- """Calculates values of contingency table marginals from its values.
- >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927)
- (1, (1, 1, 1), (1, 73, 1), 2000)
- """
- n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency
- return (
- n_iii,
- (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii),
- (
- n_iii + n_ioi + n_iio + n_ioo,
- n_iii + n_oii + n_iio + n_oio,
- n_iii + n_oii + n_ioi + n_ooi,
- ),
- sum(contingency),
- )
- class QuadgramAssocMeasures(NgramAssocMeasures):
- """
- A collection of quadgram association measures. Each association measure
- is provided as a function with five arguments::
- trigram_score_fn(n_iiii,
- (n_iiix, n_iixi, n_ixii, n_xiii),
- (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
- (n_ixxx, n_xixx, n_xxix, n_xxxi),
- n_all)
- The arguments constitute the marginals of a contingency table, counting
- the occurrences of particular events in a corpus. The letter i in the
- suffix refers to the appearance of the word in question, while x indicates
- the appearance of any word. Thus, for example:
- n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored
- n_ixxi counts (w1, *, *, w4)
- n_xxxx counts (*, *, *, *), i.e. any quadgram
- """
- _n = 4
- @staticmethod
- def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx):
- """Calculates values of a quadgram contingency table from
- marginal values.
- """
- (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple
- (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple
- (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple
- n_oiii = n_xiii - n_iiii
- n_ioii = n_ixii - n_iiii
- n_iioi = n_iixi - n_iiii
- n_ooii = n_xxii - n_iiii - n_oiii - n_ioii
- n_oioi = n_xixi - n_iiii - n_oiii - n_iioi
- n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi
- n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi
- n_iiio = n_iiix - n_iiii
- n_oiio = n_xiix - n_iiii - n_oiii - n_iiio
- n_ioio = n_ixix - n_iiii - n_ioii - n_iiio
- n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio
- n_iioo = n_iixx - n_iiii - n_iioi - n_iiio
- n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo
- n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio
- n_oooo = (
- n_xxxx
- - n_iiii
- - n_oiii
- - n_ioii
- - n_iioi
- - n_ooii
- - n_oioi
- - n_iooi
- - n_oooi
- - n_iiio
- - n_oiio
- - n_ioio
- - n_ooio
- - n_iioo
- - n_oioo
- - n_iooo
- )
- return (
- n_iiii,
- n_oiii,
- n_ioii,
- n_ooii,
- n_iioi,
- n_oioi,
- n_iooi,
- n_oooi,
- n_iiio,
- n_oiio,
- n_ioio,
- n_ooio,
- n_iioo,
- n_oioo,
- n_iooo,
- n_oooo,
- )
- @staticmethod
- def _marginals(*contingency):
- """Calculates values of contingency table marginals from its values.
- QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653)
- (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540)
- """
- n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo = (
- contingency
- )
- n_iiix = n_iiii + n_iiio
- n_iixi = n_iiii + n_iioi
- n_ixii = n_iiii + n_ioii
- n_xiii = n_iiii + n_oiii
- n_iixx = n_iiii + n_iioi + n_iiio + n_iioo
- n_ixix = n_iiii + n_ioii + n_iiio + n_ioio
- n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi
- n_xixi = n_iiii + n_oiii + n_iioi + n_oioi
- n_xxii = n_iiii + n_oiii + n_ioii + n_ooii
- n_xiix = n_iiii + n_oiii + n_iiio + n_oiio
- n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo
- n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo
- n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio
- n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi
- n_all = sum(contingency)
- return (
- n_iiii,
- (n_iiix, n_iixi, n_ixii, n_xiii),
- (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix),
- (n_ixxx, n_xixx, n_xxix, n_xxxi),
- n_all,
- )
- class ContingencyMeasures(object):
- """Wraps NgramAssocMeasures classes such that the arguments of association
- measures are contingency table values rather than marginals.
- """
- def __init__(self, measures):
- """Constructs a ContingencyMeasures given a NgramAssocMeasures class"""
- self.__class__.__name__ = 'Contingency' + measures.__class__.__name__
- for k in dir(measures):
- if k.startswith('__'):
- continue
- v = getattr(measures, k)
- if not k.startswith('_'):
- v = self._make_contingency_fn(measures, v)
- setattr(self, k, v)
- @staticmethod
- def _make_contingency_fn(measures, old_fn):
- """From an association measure function, produces a new function which
- accepts contingency table values as its arguments.
- """
- def res(*contingency):
- return old_fn(*measures._marginals(*contingency))
- res.__doc__ = old_fn.__doc__
- res.__name__ = old_fn.__name__
- return res
|