counter.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. Language Model Counter
  10. ----------------------
  11. """
  12. from __future__ import unicode_literals
  13. from collections import Sequence, defaultdict
  14. from six import string_types
  15. from nltk import compat
  16. from nltk.probability import ConditionalFreqDist, FreqDist
  17. @compat.python_2_unicode_compatible
  18. class NgramCounter(object):
  19. """Class for counting ngrams.
  20. Will count any ngram sequence you give it ;)
  21. First we need to make sure we are feeding the counter sentences of ngrams.
  22. >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
  23. >>> from nltk.util import ngrams
  24. >>> text_bigrams = [ngrams(sent, 2) for sent in text]
  25. >>> text_unigrams = [ngrams(sent, 1) for sent in text]
  26. The counting itself is very simple.
  27. >>> from nltk.lm import NgramCounter
  28. >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)
  29. You can conveniently access ngram counts using standard python dictionary notation.
  30. String keys will give you unigram counts.
  31. >>> ngram_counts['a']
  32. 2
  33. >>> ngram_counts['aliens']
  34. 0
  35. If you want to access counts for higher order ngrams, use a list or a tuple.
  36. These are treated as "context" keys, so what you get is a frequency distribution
  37. over all continuations after the given context.
  38. >>> sorted(ngram_counts[['a']].items())
  39. [('b', 1), ('c', 1)]
  40. >>> sorted(ngram_counts[('a',)].items())
  41. [('b', 1), ('c', 1)]
  42. This is equivalent to specifying explicitly the order of the ngram (in this case
  43. 2 for bigram) and indexing on the context.
  44. >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
  45. True
  46. Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
  47. It is generally advisable to use the less verbose and more flexible square
  48. bracket notation.
  49. To get the count of the full ngram "a b", do this:
  50. >>> ngram_counts[['a']]['b']
  51. 1
  52. Specifying the ngram order as a number can be useful for accessing all ngrams
  53. in that order.
  54. >>> ngram_counts[2]
  55. <ConditionalFreqDist with 4 conditions>
  56. The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
  57. Unigrams can also be accessed with a human-friendly alias.
  58. >>> ngram_counts.unigrams is ngram_counts[1]
  59. True
  60. Similarly to `collections.Counter`, you can update counts after initialization.
  61. >>> ngram_counts['e']
  62. 0
  63. >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
  64. >>> ngram_counts['e']
  65. 1
  66. """
  67. def __init__(self, ngram_text=None):
  68. """Creates a new NgramCounter.
  69. If `ngram_text` is specified, counts ngrams from it, otherwise waits for
  70. `update` method to be called explicitly.
  71. :param ngram_text: Optional text containing senteces of ngrams, as for `update` method.
  72. :type ngram_text: Iterable(Iterable(tuple(str))) or None
  73. """
  74. self._counts = defaultdict(ConditionalFreqDist)
  75. self._counts[1] = self.unigrams = FreqDist()
  76. if ngram_text:
  77. self.update(ngram_text)
  78. def update(self, ngram_text):
  79. """Updates ngram counts from `ngram_text`.
  80. Expects `ngram_text` to be a sequence of sentences (sequences).
  81. Each sentence consists of ngrams as tuples of strings.
  82. :param Iterable(Iterable(tuple(str))) ngram_text: Text containing senteces of ngrams.
  83. :raises TypeError: if the ngrams are not tuples.
  84. """
  85. for sent in ngram_text:
  86. for ngram in sent:
  87. if not isinstance(ngram, tuple):
  88. raise TypeError(
  89. "Ngram <{0}> isn't a tuple, "
  90. "but {1}".format(ngram, type(ngram))
  91. )
  92. ngram_order = len(ngram)
  93. if ngram_order == 1:
  94. self.unigrams[ngram[0]] += 1
  95. continue
  96. context, word = ngram[:-1], ngram[-1]
  97. self[ngram_order][context][word] += 1
  98. def N(self):
  99. """Returns grand total number of ngrams stored.
  100. This includes ngrams from all orders, so some duplication is expected.
  101. :rtype: int
  102. >>> from nltk.lm import NgramCounter
  103. >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
  104. >>> counts.N()
  105. 3
  106. """
  107. return sum(val.N() for val in self._counts.values())
  108. def __getitem__(self, item):
  109. """User-friendly access to ngram counts."""
  110. if isinstance(item, int):
  111. return self._counts[item]
  112. elif isinstance(item, string_types):
  113. return self._counts.__getitem__(1)[item]
  114. elif isinstance(item, Sequence):
  115. return self._counts.__getitem__(len(item) + 1)[tuple(item)]
  116. def __str__(self):
  117. return "<{0} with {1} ngram orders and {2} ngrams>".format(
  118. self.__class__.__name__, len(self._counts), self.N()
  119. )
  120. def __len__(self):
  121. return self._counts.__len__()
  122. def __contains__(self, item):
  123. return item in self._counts