# Natural Language Toolkit: Texts
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
#         Edward Loper <edloper@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
This module brings together a variety of NLTK functionality for
text analysis, and provides simple, interactive interfaces.
Functionality includes: concordancing, collocation discovery,
regular expression search over tokenized strings, and
distributional similarity.
"""

from __future__ import print_function, division, unicode_literals, absolute_import

from math import log
from collections import defaultdict, Counter, namedtuple
from functools import reduce
import re
import sys

from six import text_type

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist as CFD
from nltk.util import tokenwrap, LazyConcatenation
from nltk.metrics import f_measure, BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.compat import python_2_unicode_compatible
from nltk.tokenize import sent_tokenize

ConcordanceLine = namedtuple(
    "ConcordanceLine",
    ["left", "query", "right", "offset", "left_print", "right_print", "line"],
)


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase"""
        left = tokens[i - 1].lower() if i != 0 else "*START*"
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*"
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was
            created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores',
        indicating how often each word and the given word occur in the
        same contexts.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    # Weight each candidate by how often it and the query
                    # word both occur in the shared context c.
                    scores[w] += (
                        self._context_to_words[c][word] * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:", " ".join(empty))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c for w in words for c in self._word_to_contexts[w] if c in common
            )
            return fd
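
# A minimal usage sketch for ``ContextIndex`` (the toy token list below is
# purely illustrative; exact similarity scores depend on ``f_measure``):
#
#     tokens = ["The", "cat", "sat", "on", "the", "mat", "while", "the",
#               "dog", "sat", "on", "the", "rug", "."]
#     idx = ContextIndex(tokens, key=lambda s: s.lower())
#     idx.similar_words("cat")             # ['dog'] -- shares the ('the', 'sat') context
#     idx.common_contexts(["cat", "dog"])  # FreqDist({('the', 'sat'): 2})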


@python_2_unicode_compatible
class ConcordanceIndex(object):
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    """

    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        :param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        :param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use ``key=lambda s:s.lower()``, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
        was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        :rtype: list(int)
        :return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then the given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return "<ConcordanceIndex for %d tokens (%d types)>" % (
            len(self._tokens),
            len(self._offsets),
        )

    def find_concordance(self, word, width=80):
        """
        Find all concordance lines given the query word.
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context

        # Find the instances of the word to create the ConcordanceLine
        concordance_list = []
        offsets = self.offsets(word)
        if offsets:
            for i in offsets:
                query_word = self._tokens[i]
                # Find the context of query word.
                left_context = self._tokens[max(0, i - context) : i]
                right_context = self._tokens[i + 1 : i + context]
                # Create the pretty lines with the query_word in the middle.
                left_print = " ".join(left_context)[-half_width:]
                right_print = " ".join(right_context)[:half_width]
                # The WYSIWYG line of the concordance.
                line_print = " ".join([left_print, query_word, right_print])
                # Create the ConcordanceLine
                concordance_line = ConcordanceLine(
                    left_context,
                    query_word,
                    right_context,
                    i,
                    left_print,
                    right_print,
                    line_print,
                )
                concordance_list.append(concordance_line)
        return concordance_list

    def print_concordance(self, word, width=80, lines=25):
        """
        Print concordance lines given the query word.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=80)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int
        """
        concordance_list = self.find_concordance(word, width=width)

        if not concordance_list:
            print("no matches")
        else:
            lines = min(lines, len(concordance_list))
            print("Displaying {} of {} matches:".format(lines, len(concordance_list)))
            for i, concordance_line in enumerate(concordance_list[:lines]):
                print(concordance_line.line)
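
# A minimal usage sketch for ``ConcordanceIndex`` (the token list is an
# illustrative stand-in for a real tokenized document):
#
#     tokens = ["I", "saw", "the", "cat", ".", "The", "Cat", "saw", "me", "."]
#     ci = ConcordanceIndex(tokens, key=lambda s: s.lower())
#     ci.offsets("cat")             # [3, 6] -- 'cat' and 'Cat' both match
#     ci.print_concordance("cat")   # prints the two matching lines, width 80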


class TokenSearcher(object):
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    ``'<the><window><is><still><open>'``.  The regular expression
    passed to the ``findall()`` method is modified to treat angle
    brackets as non-capturing parentheses, in addition to matching the
    token boundaries; and to have ``'.'`` not match the angle brackets.
    """

    def __init__(self, tokens):
        self._raw = "".join("<" + w + ">" for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> from nltk.text import TokenSearcher
        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """
        # preprocess the regular expression
        regexp = re.sub(r"\s", "", regexp)
        regexp = re.sub(r"<", "(?:<(?:", regexp)
        regexp = re.sub(r">", ")>)", regexp)
        regexp = re.sub(r"(?<!\\)\.", "[^>]", regexp)

        # perform the search
        hits = re.findall(regexp, self._raw)

        # Sanity check: every hit must span whole tokens, i.e. start with
        # '<' and end with '>'.
        for h in hits:
            if not (h.startswith("<") and h.endswith(">")):
                raise ValueError("Bad regexp for TokenSearcher.findall")

        # postprocess the output
        hits = [h[1:-1].split("><") for h in hits]
        return hits
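
# A minimal sketch of how ``TokenSearcher`` rewrites a query (the token list
# and pattern here are illustrative):
#
#     ts = TokenSearcher(["a", "tall", "man", "and", "a", "short", "man"])
#     # internally, ts._raw == '<a><tall><man><and><a><short><man>'
#     ts.findall("<a>(<.*>)<man>")   # [['tall'], ['short']]
#
# The pattern '<a>(<.*>)<man>' is rewritten so that '<' and '>' become
# non-capturing groups around each token and '.' cannot cross a token
# boundary, then matched against the bracketed string.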


@python_2_unicode_compatible
class Text(object):
    """
    A wrapper around a sequence of simple (string) tokens, which is
    intended to support initial exploration of texts (via the
    interactive console).  Its methods perform a variety of analyses
    on the text's contents (e.g., counting, concordancing, collocation
    discovery), and display the results.  If you wish to write a
    program which makes use of these analyses, then you should bypass
    the ``Text`` class, and use the appropriate analysis function or
    class directly instead.

    A ``Text`` is typically initialized from a given document or
    corpus.  E.g.:

    >>> import nltk.corpus
    >>> from nltk.text import Text
    >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt'))

    """

    # This defeats lazy loading, but makes things faster.  This
    # *shouldn't* be necessary because the corpus view *should* be
    # doing intelligent caching, but without this it's running slow.
    # Look into whether the caching is working correctly.
    _COPY_TOKENS = True

    def __init__(self, tokens, name=None):
        """
        Create a Text object.

        :param tokens: The source text.
        :type tokens: sequence of str
        """
        if self._COPY_TOKENS:
            tokens = list(tokens)
        self.tokens = tokens

        if name:
            self.name = name
        elif "]" in tokens[:20]:
            end = tokens[:20].index("]")
            self.name = " ".join(text_type(tok) for tok in tokens[1:end])
        else:
            self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."

    # ////////////////////////////////////////////////////////////
    # Support item & slice access
    # ////////////////////////////////////////////////////////////

    def __getitem__(self, i):
        return self.tokens[i]

    def __len__(self):
        return len(self.tokens)

    # ////////////////////////////////////////////////////////////
    # Interactive console methods
    # ////////////////////////////////////////////////////////////

    def concordance(self, word, width=79, lines=25):
        """
        Prints a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to display (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )

        return self._concordance_index.print_concordance(word, width, lines)

    def concordance_list(self, word, width=79, lines=25):
        """
        Generate a concordance for ``word`` with the specified context window.
        Word matching is not case-sensitive.

        :param word: The target word
        :type word: str
        :param width: The width of each line, in characters (default=79)
        :type width: int
        :param lines: The number of lines to return (default=25)
        :type lines: int

        :seealso: ``ConcordanceIndex``
        """
        if "_concordance_index" not in self.__dict__:
            self._concordance_index = ConcordanceIndex(
                self.tokens, key=lambda s: s.lower()
            )
        return self._concordance_index.find_concordance(word, width)[:lines]
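
    # A minimal usage sketch for ``concordance_list`` (``moby`` as constructed
    # in the class docstring; the field names come from ``ConcordanceLine``):
    #
    #     for c in moby.concordance_list("monstrous", lines=5):
    #         print(c.offset, c.line)   # token offset and the pretty line
    #         # c.left / c.right hold the raw context tokens on each side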

    def collocation_list(self, num=20, window_size=2):
        """
        Return collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to return.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        :rtype: list(str)
        """
        if not (
            "_collocations" in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words("english")
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        return [w1 + " " + w2 for w1, w2 in self._collocations]

    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        # collocation_list() already returns joined "w1 w2" strings.
        collocation_strings = self.collocation_list(num, window_size)
        print(tokenwrap(collocation_strings, separator="; "))
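
    # A brief usage sketch (``moby`` as in the class docstring; the output
    # shown is indicative, not exact):
    #
    #     moby.collocation_list(5)   # e.g. ['Sperm Whale', 'Moby Dick', ...]
    #     moby.collocations(5)       # prints the same pairs joined with '; '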

    def count(self, word):
        """
        Count the number of times this word appears in the text.
        """
        return self.tokens.count(word)

    def index(self, word):
        """
        Find the index of the first occurrence of the word in the text.
        """
        return self.tokens.index(word)

    def readability(self, method):
        # code from nltk_contrib.readability
        raise NotImplementedError

    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if "_word_context_index" not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")

    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words used to seed the similarity search
        :type words: list(str)
        :param num: The number of contexts to display (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if "_word_context_index" not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)

    def dispersion_plot(self, words):
        """
        Produce a plot showing the distribution of the words through the text.
        Requires pylab to be installed.

        :param words: The words to be plotted
        :type words: list(str)
        :seealso: nltk.draw.dispersion_plot()
        """
        from nltk.draw import dispersion_plot

        dispersion_plot(self, words)

    def _train_default_ngram_lm(self, tokenized_sents, n=3):
        train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents)
        model = MLE(order=n)
        model.fit(train_data, padded_sents)
        return model

    def generate(self, length=100, text_seed=None, random_seed=42):
        """
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
            makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int
        """
        # Create the model when using it the first time.
        self._tokenized_sents = [
            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
        ]
        if not hasattr(self, "_trigram_model"):
            print("Building ngram index...", file=sys.stderr)
            self._trigram_model = self._train_default_ngram_lm(
                self._tokenized_sents, n=3
            )

        generated_tokens = []

        assert length > 0, "The `length` must be more than 0."
        while len(generated_tokens) < length:
            for idx, token in enumerate(
                self._trigram_model.generate(
                    length, text_seed=text_seed, random_seed=random_seed
                )
            ):
                if token == "<s>":
                    continue
                if token == "</s>":
                    break
                generated_tokens.append(token)
            random_seed += 1

        prefix = " ".join(text_seed) + " " if text_seed else ""
        output_str = prefix + tokenwrap(generated_tokens[:length])
        print(output_str)
        return output_str
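
    # A brief usage sketch (``moby`` as in the class docstring; the generated
    # string depends on the trained trigram model and the random seed):
    #
    #     moby.generate(length=20)                       # builds the model, then prints
    #     moby.generate(20, text_seed=["The", "whale"])  # condition on a seed context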

    def plot(self, *args):
        """
        See documentation for FreqDist.plot()
        :seealso: nltk.prob.FreqDist.plot()
        """
        self.vocab().plot(*args)

    def vocab(self):
        """
        :seealso: nltk.prob.FreqDist
        """
        if "_vocab" not in self.__dict__:
            # print("Building vocabulary index...")
            self._vocab = FreqDist(self)
        return self._vocab

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [" ".join(h) for h in hits]
        print(tokenwrap(hits, "; "))

    # ////////////////////////////////////////////////////////////
    # Helper Methods
    # ////////////////////////////////////////////////////////////

    _CONTEXT_RE = re.compile(r"\w+|[\.\!\?]")

    def _context(self, tokens, i):
        """
        One left & one right token, both case-normalized.  Skip over
        non-sentence-final punctuation.  Used by the ``ContextIndex``
        that is created for ``similar()`` and ``common_contexts()``.
        """
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j -= 1
        left = tokens[j] if j >= 0 else "*START*"

        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j += 1
        right = tokens[j] if j != len(tokens) else "*END*"

        return (left, right)

    # ////////////////////////////////////////////////////////////
    # String Display
    # ////////////////////////////////////////////////////////////

    def __str__(self):
        return "<Text: %s>" % self.name

    def __repr__(self):
        return "<Text: %s>" % self.name


# Prototype only; this approach will be slow to load
class TextCollection(Text):
    """A collection of texts, which can be loaded with a list of texts, or
    with a corpus consisting of one or more texts, and which supports
    counting, concordancing, collocation discovery, etc.  Initialize a
    TextCollection as follows:

    >>> import nltk.corpus
    >>> from nltk.text import TextCollection
    >>> print('hack'); from nltk.book import text1, text2, text3
    hack...
    >>> gutenberg = TextCollection(nltk.corpus.gutenberg)
    >>> mytexts = TextCollection([text1, text2, text3])

    Iterating over a TextCollection produces all the tokens of all the
    texts in order.
    """

    def __init__(self, source):
        if hasattr(source, "words"):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}

    def tf(self, term, text):
        """The frequency of the term in text."""
        return text.count(term) / len(text)

    def idf(self, term):
        """The natural log of the number of texts in the corpus divided by the
        number of texts that the term appears in.
        If a term does not appear in the corpus, 0.0 is returned."""
        # idf values are cached for performance.
        idf = self._idf_cache.get(term)
        if idf is None:
            matches = len([True for text in self._texts if term in text])
            if len(self._texts) == 0:
                raise ValueError("IDF undefined for empty document collection")
            idf = log(len(self._texts) / matches) if matches else 0.0
            self._idf_cache[term] = idf
        return idf

    def tf_idf(self, term, text):
        return self.tf(term, text) * self.idf(term)
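
    # A worked sketch of the arithmetic (``texts`` is an illustrative,
    # hypothetical collection of three tokenized texts):
    #
    #     texts = TextCollection([["a", "whale"], ["a", "ship"], ["the", "sea"]])
    #     texts.tf("a", ["a", "whale"])      # 1/2 = 0.5
    #     texts.idf("a")                     # log(3/2) ~= 0.405
    #     texts.tf_idf("a", ["a", "whale"])  # 0.5 * log(3/2) ~= 0.203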


def demo():
    from nltk.corpus import brown

    text = Text(brown.words(categories="news"))
    print(text)
    print()
    print("Concordance:")
    text.concordance("news")
    print()
    print("Distributionally similar words:")
    text.similar("news")
    print()
    print("Collocations:")
    text.collocations()
    print()
    # print("Automatically generated text:")
    # text.generate()
    # print()
    print("Dispersion plot:")
    text.dispersion_plot(["news", "report", "said", "announced"])
    print()
    print("Vocabulary plot:")
    text.plot(50)
    print()
    print("Indexing:")
    print("text[3]:", text[3])
    print("text[3:5]:", text[3:5])
    print("text.vocab()['news']:", text.vocab()["news"])


if __name__ == "__main__":
    demo()

__all__ = [
    "ContextIndex",
    "ConcordanceIndex",
    "TokenSearcher",
    "Text",
    "TextCollection",
]