# Natural Language Toolkit: Texts
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
"""
from __future__ import print_function, division, unicode_literals, absolute_import

from math import log
from collections import defaultdict, Counter, namedtuple
from functools import reduce
import re
import sys

from six import text_type

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.compat import python_2_unicode_compatible
from nltk.tokenize import sent_tokenize

ConcordanceLine = namedtuple(
    "ConcordanceLine",
    ["left", "query", "right", "offset", "left_print", "right_print", "line"],
)


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
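
    For illustration, a hedged usage sketch (the ``nltk.book`` sample texts
    are assumed to be installed, so the calls are skipped by doctest):

    >>> from nltk.book import text1  # doctest: +SKIP
    >>> idx = ContextIndex(text1.tokens, key=lambda s: s.lower())  # doctest: +SKIP
    >>> words = idx.similar_words('monstrous')  # doctest: +SKIP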
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else "*START*"
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same
        context.
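
        Scores are f-measures computed over the two words' sets of contexts.
        A hedged usage sketch (the ``nltk.book`` sample texts are assumed to
        be installed, so the calls are skipped by doctest):

        >>> from nltk.book import text1  # doctest: +SKIP
        >>> idx = ContextIndex(text1.tokens, key=lambda s: s.lower())  # doctest: +SKIP
        >>> scores = idx.word_similarity_dict('whale')  # doctest: +SKIP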
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
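
        A usage sketch (``nltk.book`` sample data assumed installed, so the
        calls are skipped by doctest):

        >>> from nltk.book import text1  # doctest: +SKIP
        >>> idx = ContextIndex(text1.tokens, key=lambda s: s.lower())  # doctest: +SKIP
        >>> fd = idx.common_contexts(['monstrous', 'true'])  # doctest: +SKIP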
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(empty))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd


@python_2_unicode_compatible
class ConcordanceIndex(object):
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
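
    For illustration, a minimal sketch over a hypothetical token list:

    >>> tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat']
    >>> index = ConcordanceIndex(tokens, key=lambda s: s.lower())
    >>> index.offsets('the')
    [0, 4]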
    """

    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
        was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then the given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return "<ConcordanceIndex for %d tokens (%d types)>" % (
            len(self._tokens),
            len(self._offsets),
        )

    def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(word)
        if offsets:
            for i in offsets:
                query_word = self._tokens[i]
                # Find the context of query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + 1 : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = " ".join(left_context)[-half_width:]
                right_print = " ".join(right_context)[:half_width]
                # The WYSIWYG line of the concordance.
                line_print = " ".join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list

    def print_concordance(self, word, width=80, lines=25):
        """
        Print concordance lines given the query word.

        :param word: The target word
        :type word: str
        :param lines: The number of lines to display (default=25)
        :type lines: int
        :param width: The width of each line, in characters (default=80)
        :type width: int
        """
        concordance_list = self.find_concordance(word, width=width)

        if not concordance_list:
            print("no matches")
        else:
            lines = min(lines, len(concordance_list))
            print("Displaying {} of {} matches:".format(lines, len(concordance_list)))
            for i, concordance_line in enumerate(concordance_list[:lines]):
                print(concordance_line.line)


class TokenSearcher(object):
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
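
    For illustration, a minimal sketch over a hypothetical token list:

    >>> searcher = TokenSearcher(['the', 'window', 'is', 'open'])
    >>> searcher.findall('<is> <open>')
    [['is', 'open']]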
    """

    def __init__(self, tokens):
        self._raw = "".join("<" + w + ">" for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        # preprocess the regular expression
        regexp = re.sub(r"\s", "", regexp)
        regexp = re.sub(r"<", "(?:<(?:", regexp)
        regexp = re.sub(r">", ")>)", regexp)
        regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)

        # perform the search
        hits = re.findall(regexp, self._raw)

        # Sanity check: every hit must span complete tokens.
        for h in hits:
            if not (h.startswith("<") and h.endswith(">")):
                raise ValueError("Bad regexp for TokenSearcher.findall")

        # postprocess the output
        hits = [h[1:-1].split("><") for h in hits]
        return hits


@python_2_unicode_compatible
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contexts (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """

    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif "]" in tokens[:20]:
            end = tokens[:20].index("]")
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    # ////////////////////////////////////////////////////////////
    # Support item & slice access
    # ////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    # ////////////////////////////////////////////////////////////
    # Interactive console methods
    # ////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
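
        A usage sketch (``nltk.book`` sample data assumed installed, so the
        call is skipped by doctest):

        >>> from nltk.book import text1  # doctest: +SKIP
        >>> text1.concordance('monstrous', width=79, lines=5)  # doctest: +SKIP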
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )

        return self._concordance_index.print_concordance(word, width, lines)

    def concordance_list(self, word, width=79, lines=25):
        """
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )
        return self._concordance_index.find_concordance(word, width)[:lines]

    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
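
        A usage sketch (``nltk.book`` sample data assumed installed; the
        output shown is indicative only, so the call is skipped by doctest):

        >>> from nltk.book import text4  # doctest: +SKIP
        >>> text4.collocation_list()[:2]  # doctest: +SKIP
        ['United States', 'fellow citizens']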
        """
        if not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        return [w1 + " " + w2 for w1, w2 in self._collocations]

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        # collocation_list() already returns "w1 w2" strings, so print them directly.
        collocation_strings = self.collocation_list(num, window_size)
        print(tokenwrap(collocation_strings, separator="; "))

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
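
        A usage sketch (``nltk.book`` sample data assumed installed, so the
        call is skipped by doctest):

        >>> from nltk.book import text1  # doctest: +SKIP
        >>> text1.similar('monstrous', num=5)  # doctest: +SKIP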
        """
        if "_word_context_index" not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
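
        A usage sketch (``nltk.book`` sample data assumed installed, so the
        call is skipped by doctest):

        >>> from nltk.book import text1  # doctest: +SKIP
        >>> text1.common_contexts(['monstrous', 'very'])  # doctest: +SKIP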
        """
        if "_word_context_index" not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires matplotlib to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words)

    def _train_default_ngram_lm(self, tokenized_sents, n=3):
        train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
        model = MLE(order=n)
        model.fit(train_data, padded_sents)
        return model

    def generate(self, length=100, text_seed=None, random_seed=42):
        """
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int
        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)
        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
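
        A usage sketch (``nltk.book`` sample data assumed installed; building
        the language model is slow, so the call is skipped by doctest):

        >>> from nltk.book import text1  # doctest: +SKIP
        >>> text1.generate(length=20, random_seed=42)  # doctest: +SKIP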
        """
        # Create the model when using it the first time.
        self._tokenized_sents = [
            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
        ]
        if not hasattr(self, "_trigram_model"):
            print("Building ngram index...", file=sys.stderr)
            self._trigram_model = self._train_default_ngram_lm(
                self._tokenized_sents, n=3
            )

        generated_tokens = []

        assert length > 0, "The `length` must be more than 0."
        while len(generated_tokens) < length:
            for idx, token in enumerate(
                self._trigram_model.generate(
                    length, text_seed=text_seed, random_seed=random_seed
                )
            ):
                if token == "<s>":
                    continue
                if token == "</s>":
                    break
                generated_tokens.append(token)
            random_seed += 1

        prefix = " ".join(text_seed) + " " if text_seed else ""
        output_str = prefix + tokenwrap(generated_tokens[:length])
        print(output_str)
        return output_str

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            # print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [" ".join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    # ////////////////////////////////////////////////////////////
    # Helper Methods
    # ////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile(r"\w+|[\.\!\?]")

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = tokens[j] if j >= 0 else "*START*"

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = tokens[j] if j != len(tokens) else "*END*"

        return (left, right)

    # ////////////////////////////////////////////////////////////
    # String Display
    # ////////////////////////////////////////////////////////////

    def __str__(self):
        return "<Text: %s>" % self.name

    def __repr__(self):
        return "<Text: %s>" % self.name


# Prototype only; this approach will be slow to load
class TextCollection(Text):
    """A collection of texts, which can be loaded with a list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> print('hack'); from nltk.book import text1, text2, text3
    hack...
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
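
    For illustration, a small tf-idf sketch over two hypothetical two-token
    texts (``tf`` is a relative frequency; ``idf`` is a natural logarithm):

    >>> tiny = TextCollection([Text(['a', 'b']), Text(['b', 'c'])])
    >>> tiny.tf('b', Text(['a', 'b']))
    0.5
    >>> tiny.idf('a') == log(2 / 1)
    True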
    """

    def __init__(self, source):
        if hasattr(source, "words"):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}

    def tf(self, term, text):
        """ The frequency of the term in text. """
        return text.count(term) / len(text)

    def idf(self, term):
        """ The natural log of the number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned. """
        # idf values are cached for performance.
        idf = self._idf_cache.get(term)
        if idf is None:
            matches = len([True for text in self._texts if term in text])
            if len(self._texts) == 0:
                raise ValueError("IDF undefined for empty document collection")
            idf = log(len(self._texts) / matches) if matches else 0.0
            self._idf_cache[term] = idf
        return idf

    def tf_idf(self, term, text):
        return self.tf(term, text) * self.idf(term)


def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()
    print("Concordance:")
    text.concordance("news")
    print()
    print("Distributionally similar words:")
    text.similar("news")
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(["news", "report", "said", "announced"])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"])


if __name__ == "__main__":
    demo()

__all__ = [
    "ContextIndex",
    "ConcordanceIndex",
    "TokenSearcher",
    "Text",
    "TextCollection",
]