# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
#         Steven Bird <stevenbird1@gmail.com> (minor edits)
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

  8. """
  9. S-Expression Tokenizer
  10. ``SExprTokenizer`` is used to find parenthesized expressions in a
  11. string. In particular, it divides a string into a sequence of
  12. substrings that are either parenthesized expressions (including any
  13. nested parenthesized expressions), or other whitespace-separated
  14. tokens.
  15. >>> from nltk.tokenize import SExprTokenizer
  16. >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
  17. ['(a b (c d))', 'e', 'f', '(g)']
  18. By default, `SExprTokenizer` will raise a ``ValueError`` exception if
  19. used to tokenize an expression with non-matching parentheses:
  20. >>> SExprTokenizer().tokenize('c) d) e (f (g')
  21. Traceback (most recent call last):
  22. ...
  23. ValueError: Un-matched close paren at char 1
  24. The ``strict`` argument can be set to False to allow for
  25. non-matching parentheses. Any unmatched close parentheses will be
  26. listed as their own s-expression; and the last partial sexpr with
  27. unmatched open parentheses will be listed as its own sexpr:
  28. >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
  29. ['c', ')', 'd', ')', 'e', '(f (g']
  30. The characters used for open and close parentheses may be customized
  31. using the ``parens`` argument to the `SExprTokenizer` constructor:
  32. >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
  33. ['{a b {c d}}', 'e', 'f', '{g}']
  34. The s-expression tokenizer is also available as a function:
  35. >>> from nltk.tokenize import sexpr_tokenize
  36. >>> sexpr_tokenize('(a b (c d)) e f (g)')
  37. ['(a b (c d))', 'e', 'f', '(g)']
  38. """

import re

from nltk.tokenize.api import TokenizerI


class SExprTokenizer(TokenizerI):
    """
    A tokenizer that divides strings into s-expressions.
    An s-expression can be either:

      - a parenthesized expression, including any nested parenthesized
        expressions, or
      - a sequence of non-whitespace non-parenthesis characters.

    For example, the string ``(a (b c)) d e (f)`` consists of four
    s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

    By default, the characters ``(`` and ``)`` are treated as open and
    close parentheses, but alternative strings may be specified.
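
    For example, multi-character delimiters can be given as a list of
    two strings, since the tokenizer matches each delimiter string as
    a unit:

        >>> SExprTokenizer(parens=['<<', '>>']).tokenize('<<a>> b <<c <<d>>>>')
        ['<<a>>', 'b', '<<c <<d>>>>']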

    :param parens: A two-element sequence specifying the open and close parentheses
        that should be used to find sexprs. This will typically be either a
        two-character string, or a list of two strings.
    :type parens: str or list
    :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
    """

    def __init__(self, parens='()', strict=True):
        if len(parens) != 2:
            raise ValueError('parens must contain exactly two strings')
        self._strict = strict
        self._open_paren = parens[0]
        self._close_paren = parens[1]
        self._paren_regexp = re.compile(
            '%s|%s' % (re.escape(parens[0]), re.escape(parens[1]))
        )

    def tokenize(self, text):
        """
        Return a list of s-expressions extracted from *text*.
        For example:

            >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
            ['(a b (c d))', 'e', 'f', '(g)']

        All parentheses are assumed to mark s-expressions.
        (No special processing is done to exclude parentheses that occur
        inside strings, or following backslash characters.)

        If the given expression contains non-matching parentheses,
        then the behavior of the tokenizer depends on the ``strict``
        parameter to the constructor. If ``strict`` is ``True``, then
        raise a ``ValueError``. If ``strict`` is ``False``, then any
        unmatched close parentheses will be listed as their own
        s-expression; and the last partial s-expression with unmatched open
        parentheses will be listed as its own s-expression:

            >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
            ['c', ')', 'd', ')', 'e', '(f (g']

        :param text: the string to be tokenized
        :type text: str or iter(str)
        :rtype: iter(str)
        """
        result = []
        pos = 0  # start of the not-yet-consumed remainder of text
        depth = 0  # current parenthesis nesting depth
        for m in self._paren_regexp.finditer(text):
            paren = m.group()
            if depth == 0:
                # At top level: emit any whitespace-separated tokens
                # that precede this parenthesis.
                result += text[pos : m.start()].split()
                pos = m.start()
            if paren == self._open_paren:
                depth += 1
            if paren == self._close_paren:
                if self._strict and depth == 0:
                    raise ValueError('Un-matched close paren at char %d' % m.start())
                depth = max(0, depth - 1)
                if depth == 0:
                    # This close paren completes a top-level s-expression.
                    result.append(text[pos : m.end()])
                    pos = m.end()
        if self._strict and depth > 0:
            raise ValueError('Un-matched open paren at char %d' % pos)
        if pos < len(text):
            result.append(text[pos:])
        return result


sexpr_tokenize = SExprTokenizer().tokenize
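

if __name__ == '__main__':
    # Usage sketch exercising the tokenizer defined above; this demo block
    # is an illustrative addition, not part of the upstream NLTK module.
    print(SExprTokenizer().tokenize('(a b (c d)) e f (g)'))
    # ['(a b (c d))', 'e', 'f', '(g)']
    print(SExprTokenizer(strict=False).tokenize('c) d) e (f (g'))
    # ['c', ')', 'd', ')', 'e', '(f (g']
    print(SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}'))
    # ['{a b {c d}}', 'e', 'f', '{g}']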