dispersion.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. # Natural Language Toolkit: Dispersion Plots
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Steven Bird <stevenbird1@gmail.com>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. A utility for displaying lexical dispersion.
  9. """
  10. def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
  11. """
  12. Generate a lexical dispersion plot.
  13. :param text: The source text
  14. :type text: list(str) or enum(str)
  15. :param words: The target words
  16. :type words: list of str
  17. :param ignore_case: flag to set if case should be ignored when searching text
  18. :type ignore_case: bool
  19. """
  20. try:
  21. from matplotlib import pylab
  22. except ImportError:
  23. raise ValueError(
  24. 'The plot function requires matplotlib to be installed.'
  25. 'See http://matplotlib.org/'
  26. )
  27. text = list(text)
  28. words.reverse()
  29. if ignore_case:
  30. words_to_comp = list(map(str.lower, words))
  31. text_to_comp = list(map(str.lower, text))
  32. else:
  33. words_to_comp = words
  34. text_to_comp = text
  35. points = [
  36. (x, y)
  37. for x in range(len(text_to_comp))
  38. for y in range(len(words_to_comp))
  39. if text_to_comp[x] == words_to_comp[y]
  40. ]
  41. if points:
  42. x, y = list(zip(*points))
  43. else:
  44. x = y = ()
  45. pylab.plot(x, y, "b|", scalex=0.1)
  46. pylab.yticks(list(range(len(words))), words, color="b")
  47. pylab.ylim(-1, len(words))
  48. pylab.title(title)
  49. pylab.xlabel("Word Offset")
  50. pylab.show()
  51. if __name__ == '__main__':
  52. import nltk.compat
  53. from nltk.corpus import gutenberg
  54. words = ['Elinor', 'Marianne', 'Edward', 'Willoughby']
  55. dispersion_plot(gutenberg.words('austen-sense.txt'), words)