simple.py

# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split()
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ')
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n')
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
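
A sketch of one such use, passing a simple tokenizer to a plaintext
corpus reader (the corpus path and file pattern here are hypothetical):

    >>> from nltk.tokenize import SpaceTokenizer  # doctest: +SKIP
    >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader  # doctest: +SKIP
    >>> reader = PlaintextCorpusReader('my_corpus', r'.*\.txt',
    ...                                word_tokenizer=SpaceTokenizer())  # doctest: +SKIP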
  30. """
  31. from __future__ import unicode_literals
  32. from nltk.tokenize.api import TokenizerI, StringTokenizer
  33. from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize
class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = ' '


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = '\t'


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
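
    A small doctest-style sketch (illustrative; the outputs follow directly
    from ``list(s)`` and the per-character spans), assuming the class is
    imported from this module:

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize('abc')
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize('abc'))
        [(0, 1), (1, 2), (2, 3)]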
  55. """
  56. def tokenize(self, s):
  57. return list(s)
  58. def span_tokenize(self, s):
  59. for i, j in enumerate(range(1, len(s) + 1)):
  60. yield i, j
class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s)
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline (see the sketch below).
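
    An illustrative sketch of ``discard-eof``: it removes only a trailing
    blank line, whereas ``keep`` preserves it (the inputs here are just
    small example strings):

        >>> LineTokenizer(blanklines='keep').tokenize("a\nb\n\n")
        ['a', 'b', '']
        >>> LineTokenizer(blanklines='discard-eof').tokenize("a\nb\n\n")
        ['a', 'b']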
  79. """

    def __init__(self, blanklines='discard'):
        valid_blanklines = ('discard', 'keep', 'discard-eof')
        if blanklines not in valid_blanklines:
            raise ValueError(
                'Blank lines must be one of: %s' % ' '.join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == 'discard':
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == 'discard-eof':
            # Drop only a blank final line (the token produced by a
            # trailing newline).
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    # discard-eof not implemented
    def span_tokenize(self, s):
        if self._blanklines == 'keep':
            # string_span_tokenize splits on a literal separator string,
            # so pass a real newline rather than the two characters r'\n'.
            for span in string_span_tokenize(s, '\n'):
                yield span
        else:
            for span in regexp_span_tokenize(s, r'\n(\s+\n)*'):
                yield span


######################################################################
# { Tokenization Functions
######################################################################
# XXX: it is stated in module docs that there are no function versions


def line_tokenize(text, blanklines='discard'):
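    r"""Tokenize *text* into lines using ``LineTokenizer``.

    A minimal doctest-style sketch (the function simply delegates to
    ``LineTokenizer``, so the behaviour shown follows from that class):

        >>> from nltk.tokenize.simple import line_tokenize
        >>> line_tokenize("a\nb\n\nc")
        ['a', 'b', 'c']

    :param blanklines: passed through to ``LineTokenizer``.
    """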
    return LineTokenizer(blanklines).tokenize(text)