# -*- coding: utf-8 -*-
# Natural Language Toolkit
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Language Model Counter
----------------------
"""
from __future__ import unicode_literals

from collections import defaultdict

try:
    # `Sequence` moved to `collections.abc` in Python 3.3 and is removed
    # from `collections` in Python 3.10.
    from collections.abc import Sequence
except ImportError:  # pragma: no cover - Python 2 fallback
    from collections import Sequence

from six import string_types

from nltk import compat
from nltk.probability import ConditionalFreqDist, FreqDist
@compat.python_2_unicode_compatible
class NgramCounter(object):
    """Class for counting ngrams.

    Will count any ngram sequence you give it ;)

    First we need to make sure we are feeding the counter sentences of ngrams.

    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
    >>> from nltk.util import ngrams
    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
    >>> text_unigrams = [ngrams(sent, 1) for sent in text]

    The counting itself is very simple.

    >>> from nltk.lm import NgramCounter
    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)

    You can conveniently access ngram counts using standard python dictionary notation.
    String keys will give you unigram counts.

    >>> ngram_counts['a']
    2
    >>> ngram_counts['aliens']
    0

    If you want to access counts for higher order ngrams, use a list or a tuple.
    These are treated as "context" keys, so what you get is a frequency distribution
    over all continuations after the given context.

    >>> sorted(ngram_counts[['a']].items())
    [('b', 1), ('c', 1)]
    >>> sorted(ngram_counts[('a',)].items())
    [('b', 1), ('c', 1)]

    This is equivalent to specifying explicitly the order of the ngram (in this case
    2 for bigram) and indexing on the context.

    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
    True

    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
    It is generally advisable to use the less verbose and more flexible square
    bracket notation.

    To get the count of the full ngram "a b", do this:

    >>> ngram_counts[['a']]['b']
    1

    Specifying the ngram order as a number can be useful for accessing all ngrams
    in that order.

    >>> ngram_counts[2]
    <ConditionalFreqDist with 4 conditions>

    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
    Unigrams can also be accessed with a human-friendly alias.

    >>> ngram_counts.unigrams is ngram_counts[1]
    True

    Similarly to `collections.Counter`, you can update counts after initialization.

    >>> ngram_counts['e']
    0
    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
    >>> ngram_counts['e']
    1

    """

    def __init__(self, ngram_text=None):
        """Creates a new NgramCounter.

        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
        `update` method to be called explicitly.

        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
        :type ngram_text: Iterable(Iterable(tuple(str))) or None

        """
        # Maps ngram order -> ConditionalFreqDist of (context -> word counts).
        # Order 1 is special-cased as a plain FreqDist because unigrams have
        # no context; `self.unigrams` aliases that same object.
        self._counts = defaultdict(ConditionalFreqDist)
        self._counts[1] = self.unigrams = FreqDist()

        if ngram_text:
            self.update(ngram_text)

    def update(self, ngram_text):
        """Updates ngram counts from `ngram_text`.

        Expects `ngram_text` to be a sequence of sentences (sequences).
        Each sentence consists of ngrams as tuples of strings.

        :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams.
        :raises TypeError: if the ngrams are not tuples.

        """
        for sent in ngram_text:
            for ngram in sent:
                if not isinstance(ngram, tuple):
                    raise TypeError(
                        "Ngram <{0}> isn't a tuple, "
                        "but {1}".format(ngram, type(ngram))
                    )

                ngram_order = len(ngram)
                if ngram_order == 1:
                    self.unigrams[ngram[0]] += 1
                    continue

                # For higher orders, split the ngram into (context, word) and
                # count the word under that context in the right-order dist.
                context, word = ngram[:-1], ngram[-1]
                self[ngram_order][context][word] += 1

    def N(self):
        """Returns grand total number of ngrams stored.

        This includes ngrams from all orders, so some duplication is expected.

        :rtype: int

        >>> from nltk.lm import NgramCounter
        >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]])
        >>> counts.N()
        3

        """
        return sum(val.N() for val in self._counts.values())

    def __getitem__(self, item):
        """User-friendly access to ngram counts.

        - int key: the full distribution for that ngram order.
        - string key: the unigram count of that word.
        - sequence key: treated as a context of length n, yielding the
          FreqDist of continuations from the order-(n + 1) distribution.
        """
        if isinstance(item, int):
            return self._counts[item]
        elif isinstance(item, string_types):
            return self._counts[1][item]
        elif isinstance(item, Sequence):
            return self._counts[len(item) + 1][tuple(item)]
        # NOTE(review): any other key type implicitly returns None; kept for
        # backward compatibility with existing callers.

    def __str__(self):
        return "<{0} with {1} ngram orders and {2} ngrams>".format(
            self.__class__.__name__, len(self._counts), self.N()
        )

    def __len__(self):
        return self._counts.__len__()

    def __contains__(self, item):
        return item in self._counts
|