12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- # Natural Language Toolkit: Dispersion Plots
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Steven Bird <stevenbird1@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- A utility for displaying lexical dispersion.
- """
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Every target word gets its own horizontal row, and a tick is drawn at
    each offset in ``text`` where that word occurs.  The first entry of
    ``words`` is drawn on the top row.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words (the caller's sequence is not modified)
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: The title displayed above the plot
    :type title: str
    :raises ValueError: if matplotlib cannot be imported
    """
    try:
        from matplotlib import pylab
    except ImportError:
        raise ValueError(
            'The plot function requires matplotlib to be installed. '
            'See http://matplotlib.org/'
        )

    text = list(text)
    # Reverse a *copy*: row 0 is the bottom of the y axis, so reversing puts
    # the first requested word on top without mutating the caller's list.
    words = list(words)[::-1]

    if ignore_case:
        words_to_comp = [word.lower() for word in words]
        text_to_comp = [token.lower() for token in text]
    else:
        words_to_comp = words
        text_to_comp = text

    # Map each target word to the row(s) it is drawn on so the text is
    # scanned once.  A list of rows keeps duplicated target words on every
    # row they were requested for.
    rows_by_word = {}
    for row, word in enumerate(words_to_comp):
        rows_by_word.setdefault(word, []).append(row)

    points = [
        (offset, row)
        for offset, token in enumerate(text_to_comp)
        for row in rows_by_word.get(token, ())
    ]

    if points:
        xs, ys = zip(*points)
    else:
        # No occurrences at all: still draw an empty, labelled plot.
        xs = ys = ()

    pylab.plot(xs, ys, "b|")
    pylab.yticks(list(range(len(words))), words, color="b")
    # Leave one row of padding below the first row and above the last.
    pylab.ylim(-1, len(words))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()
if __name__ == '__main__':
    # Demo: plot where four character names occur across the Gutenberg
    # corpus file 'austen-sense.txt'.
    import nltk.compat
    from nltk.corpus import gutenberg

    demo_targets = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
    austen_tokens = gutenberg.words('austen-sense.txt')
    dispersion_plot(austen_tokens, demo_targets)
|