shiftreduce.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. # Natural Language Toolkit: Shift-Reduce Parser
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from __future__ import print_function, unicode_literals
  9. from nltk.grammar import Nonterminal
  10. from nltk.tree import Tree
  11. from nltk.compat import unicode_repr
  12. from nltk.parse.api import ParserI
  13. ##//////////////////////////////////////////////////////
  14. ## Shift/Reduce Parser
  15. ##//////////////////////////////////////////////////////
  16. class ShiftReduceParser(ParserI):
  17. """
  18. A simple bottom-up CFG parser that uses two operations, "shift"
  19. and "reduce", to find a single parse for a text.
  20. ``ShiftReduceParser`` maintains a stack, which records the
  21. structure of a portion of the text. This stack is a list of
  22. strings and Trees that collectively cover a portion of
  23. the text. For example, while parsing the sentence "the dog saw
  24. the man" with a typical grammar, ``ShiftReduceParser`` will produce
  25. the following stack, which covers "the dog saw"::
  26. [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
  27. ``ShiftReduceParser`` attempts to extend the stack to cover the
  28. entire text, and to combine the stack elements into a single tree,
  29. producing a complete parse for the sentence.
  30. Initially, the stack is empty. It is extended to cover the text,
  31. from left to right, by repeatedly applying two operations:
  32. - "shift" moves a token from the beginning of the text to the
  33. end of the stack.
  34. - "reduce" uses a CFG production to combine the rightmost stack
  35. elements into a single Tree.
  36. Often, more than one operation can be performed on a given stack.
  37. In this case, ``ShiftReduceParser`` uses the following heuristics
  38. to decide which operation to perform:
  39. - Only shift if no reductions are available.
  40. - If multiple reductions are available, then apply the reduction
  41. whose CFG production is listed earliest in the grammar.
  42. Note that these heuristics are not guaranteed to choose an
  43. operation that leads to a parse of the text. Also, if multiple
  44. parses exists, ``ShiftReduceParser`` will return at most one of
  45. them.
  46. :see: ``nltk.grammar``
  47. """
  48. def __init__(self, grammar, trace=0):
  49. """
  50. Create a new ``ShiftReduceParser``, that uses ``grammar`` to
  51. parse texts.
  52. :type grammar: Grammar
  53. :param grammar: The grammar used to parse texts.
  54. :type trace: int
  55. :param trace: The level of tracing that should be used when
  56. parsing a text. ``0`` will generate no tracing output;
  57. and higher numbers will produce more verbose tracing
  58. output.
  59. """
  60. self._grammar = grammar
  61. self._trace = trace
  62. self._check_grammar()
  63. def grammar(self):
  64. return self._grammar
  65. def parse(self, tokens):
  66. tokens = list(tokens)
  67. self._grammar.check_coverage(tokens)
  68. # initialize the stack.
  69. stack = []
  70. remaining_text = tokens
  71. # Trace output.
  72. if self._trace:
  73. print('Parsing %r' % " ".join(tokens))
  74. self._trace_stack(stack, remaining_text)
  75. # iterate through the text, pushing the token onto
  76. # the stack, then reducing the stack.
  77. while len(remaining_text) > 0:
  78. self._shift(stack, remaining_text)
  79. while self._reduce(stack, remaining_text):
  80. pass
  81. # Did we reduce everything?
  82. if len(stack) == 1:
  83. # Did we end up with the right category?
  84. if stack[0].label() == self._grammar.start().symbol():
  85. yield stack[0]
  86. def _shift(self, stack, remaining_text):
  87. """
  88. Move a token from the beginning of ``remaining_text`` to the
  89. end of ``stack``.
  90. :type stack: list(str and Tree)
  91. :param stack: A list of strings and Trees, encoding
  92. the structure of the text that has been parsed so far.
  93. :type remaining_text: list(str)
  94. :param remaining_text: The portion of the text that is not yet
  95. covered by ``stack``.
  96. :rtype: None
  97. """
  98. stack.append(remaining_text[0])
  99. remaining_text.remove(remaining_text[0])
  100. if self._trace:
  101. self._trace_shift(stack, remaining_text)
  102. def _match_rhs(self, rhs, rightmost_stack):
  103. """
  104. :rtype: bool
  105. :return: true if the right hand side of a CFG production
  106. matches the rightmost elements of the stack. ``rhs``
  107. matches ``rightmost_stack`` if they are the same length,
  108. and each element of ``rhs`` matches the corresponding
  109. element of ``rightmost_stack``. A nonterminal element of
  110. ``rhs`` matches any Tree whose node value is equal
  111. to the nonterminal's symbol. A terminal element of ``rhs``
  112. matches any string whose type is equal to the terminal.
  113. :type rhs: list(terminal and Nonterminal)
  114. :param rhs: The right hand side of a CFG production.
  115. :type rightmost_stack: list(string and Tree)
  116. :param rightmost_stack: The rightmost elements of the parser's
  117. stack.
  118. """
  119. if len(rightmost_stack) != len(rhs):
  120. return False
  121. for i in range(len(rightmost_stack)):
  122. if isinstance(rightmost_stack[i], Tree):
  123. if not isinstance(rhs[i], Nonterminal):
  124. return False
  125. if rightmost_stack[i].label() != rhs[i].symbol():
  126. return False
  127. else:
  128. if isinstance(rhs[i], Nonterminal):
  129. return False
  130. if rightmost_stack[i] != rhs[i]:
  131. return False
  132. return True
  133. def _reduce(self, stack, remaining_text, production=None):
  134. """
  135. Find a CFG production whose right hand side matches the
  136. rightmost stack elements; and combine those stack elements
  137. into a single Tree, with the node specified by the
  138. production's left-hand side. If more than one CFG production
  139. matches the stack, then use the production that is listed
  140. earliest in the grammar. The new Tree replaces the
  141. elements in the stack.
  142. :rtype: Production or None
  143. :return: If a reduction is performed, then return the CFG
  144. production that the reduction is based on; otherwise,
  145. return false.
  146. :type stack: list(string and Tree)
  147. :param stack: A list of strings and Trees, encoding
  148. the structure of the text that has been parsed so far.
  149. :type remaining_text: list(str)
  150. :param remaining_text: The portion of the text that is not yet
  151. covered by ``stack``.
  152. """
  153. if production is None:
  154. productions = self._grammar.productions()
  155. else:
  156. productions = [production]
  157. # Try each production, in order.
  158. for production in productions:
  159. rhslen = len(production.rhs())
  160. # check if the RHS of a production matches the top of the stack
  161. if self._match_rhs(production.rhs(), stack[-rhslen:]):
  162. # combine the tree to reflect the reduction
  163. tree = Tree(production.lhs().symbol(), stack[-rhslen:])
  164. stack[-rhslen:] = [tree]
  165. # We reduced something
  166. if self._trace:
  167. self._trace_reduce(stack, production, remaining_text)
  168. return production
  169. # We didn't reduce anything
  170. return None
  171. def trace(self, trace=2):
  172. """
  173. Set the level of tracing output that should be generated when
  174. parsing a text.
  175. :type trace: int
  176. :param trace: The trace level. A trace level of ``0`` will
  177. generate no tracing output; and higher trace levels will
  178. produce more verbose tracing output.
  179. :rtype: None
  180. """
  181. # 1: just show shifts.
  182. # 2: show shifts & reduces
  183. # 3: display which tokens & productions are shifed/reduced
  184. self._trace = trace
  185. def _trace_stack(self, stack, remaining_text, marker=' '):
  186. """
  187. Print trace output displaying the given stack and text.
  188. :rtype: None
  189. :param marker: A character that is printed to the left of the
  190. stack. This is used with trace level 2 to print 'S'
  191. before shifted stacks and 'R' before reduced stacks.
  192. """
  193. s = ' ' + marker + ' [ '
  194. for elt in stack:
  195. if isinstance(elt, Tree):
  196. s += unicode_repr(Nonterminal(elt.label())) + ' '
  197. else:
  198. s += unicode_repr(elt) + ' '
  199. s += '* ' + ' '.join(remaining_text) + ']'
  200. print(s)
  201. def _trace_shift(self, stack, remaining_text):
  202. """
  203. Print trace output displaying that a token has been shifted.
  204. :rtype: None
  205. """
  206. if self._trace > 2:
  207. print('Shift %r:' % stack[-1])
  208. if self._trace == 2:
  209. self._trace_stack(stack, remaining_text, 'S')
  210. elif self._trace > 0:
  211. self._trace_stack(stack, remaining_text)
  212. def _trace_reduce(self, stack, production, remaining_text):
  213. """
  214. Print trace output displaying that ``production`` was used to
  215. reduce ``stack``.
  216. :rtype: None
  217. """
  218. if self._trace > 2:
  219. rhs = " ".join(production.rhs())
  220. print('Reduce %r <- %s' % (production.lhs(), rhs))
  221. if self._trace == 2:
  222. self._trace_stack(stack, remaining_text, 'R')
  223. elif self._trace > 1:
  224. self._trace_stack(stack, remaining_text)
  225. def _check_grammar(self):
  226. """
  227. Check to make sure that all of the CFG productions are
  228. potentially useful. If any productions can never be used,
  229. then print a warning.
  230. :rtype: None
  231. """
  232. productions = self._grammar.productions()
  233. # Any production whose RHS is an extension of another production's RHS
  234. # will never be used.
  235. for i in range(len(productions)):
  236. for j in range(i + 1, len(productions)):
  237. rhs1 = productions[i].rhs()
  238. rhs2 = productions[j].rhs()
  239. if rhs1[: len(rhs2)] == rhs2:
  240. print('Warning: %r will never be used' % productions[i])
  241. ##//////////////////////////////////////////////////////
  242. ## Stepping Shift/Reduce Parser
  243. ##//////////////////////////////////////////////////////
  244. class SteppingShiftReduceParser(ShiftReduceParser):
  245. """
  246. A ``ShiftReduceParser`` that allows you to setp through the parsing
  247. process, performing a single operation at a time. It also allows
  248. you to change the parser's grammar midway through parsing a text.
  249. The ``initialize`` method is used to start parsing a text.
  250. ``shift`` performs a single shift operation, and ``reduce`` performs
  251. a single reduce operation. ``step`` will perform a single reduce
  252. operation if possible; otherwise, it will perform a single shift
  253. operation. ``parses`` returns the set of parses that have been
  254. found by the parser.
  255. :ivar _history: A list of ``(stack, remaining_text)`` pairs,
  256. containing all of the previous states of the parser. This
  257. history is used to implement the ``undo`` operation.
  258. :see: ``nltk.grammar``
  259. """
  260. def __init__(self, grammar, trace=0):
  261. super(SteppingShiftReduceParser, self).__init__(grammar, trace)
  262. self._stack = None
  263. self._remaining_text = None
  264. self._history = []
  265. def parse(self, tokens):
  266. tokens = list(tokens)
  267. self.initialize(tokens)
  268. while self.step():
  269. pass
  270. return self.parses()
  271. def stack(self):
  272. """
  273. :return: The parser's stack.
  274. :rtype: list(str and Tree)
  275. """
  276. return self._stack
  277. def remaining_text(self):
  278. """
  279. :return: The portion of the text that is not yet covered by the
  280. stack.
  281. :rtype: list(str)
  282. """
  283. return self._remaining_text
  284. def initialize(self, tokens):
  285. """
  286. Start parsing a given text. This sets the parser's stack to
  287. ``[]`` and sets its remaining text to ``tokens``.
  288. """
  289. self._stack = []
  290. self._remaining_text = tokens
  291. self._history = []
  292. def step(self):
  293. """
  294. Perform a single parsing operation. If a reduction is
  295. possible, then perform that reduction, and return the
  296. production that it is based on. Otherwise, if a shift is
  297. possible, then perform it, and return True. Otherwise,
  298. return False.
  299. :return: False if no operation was performed; True if a shift was
  300. performed; and the CFG production used to reduce if a
  301. reduction was performed.
  302. :rtype: Production or bool
  303. """
  304. return self.reduce() or self.shift()
  305. def shift(self):
  306. """
  307. Move a token from the beginning of the remaining text to the
  308. end of the stack. If there are no more tokens in the
  309. remaining text, then do nothing.
  310. :return: True if the shift operation was successful.
  311. :rtype: bool
  312. """
  313. if len(self._remaining_text) == 0:
  314. return False
  315. self._history.append((self._stack[:], self._remaining_text[:]))
  316. self._shift(self._stack, self._remaining_text)
  317. return True
  318. def reduce(self, production=None):
  319. """
  320. Use ``production`` to combine the rightmost stack elements into
  321. a single Tree. If ``production`` does not match the
  322. rightmost stack elements, then do nothing.
  323. :return: The production used to reduce the stack, if a
  324. reduction was performed. If no reduction was performed,
  325. return None.
  326. :rtype: Production or None
  327. """
  328. self._history.append((self._stack[:], self._remaining_text[:]))
  329. return_val = self._reduce(self._stack, self._remaining_text, production)
  330. if not return_val:
  331. self._history.pop()
  332. return return_val
  333. def undo(self):
  334. """
  335. Return the parser to its state before the most recent
  336. shift or reduce operation. Calling ``undo`` repeatedly return
  337. the parser to successively earlier states. If no shift or
  338. reduce operations have been performed, ``undo`` will make no
  339. changes.
  340. :return: true if an operation was successfully undone.
  341. :rtype: bool
  342. """
  343. if len(self._history) == 0:
  344. return False
  345. (self._stack, self._remaining_text) = self._history.pop()
  346. return True
  347. def reducible_productions(self):
  348. """
  349. :return: A list of the productions for which reductions are
  350. available for the current parser state.
  351. :rtype: list(Production)
  352. """
  353. productions = []
  354. for production in self._grammar.productions():
  355. rhslen = len(production.rhs())
  356. if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
  357. productions.append(production)
  358. return productions
  359. def parses(self):
  360. """
  361. :return: An iterator of the parses that have been found by this
  362. parser so far.
  363. :rtype: iter(Tree)
  364. """
  365. if (
  366. len(self._remaining_text) == 0
  367. and len(self._stack) == 1
  368. and self._stack[0].label() == self._grammar.start().symbol()
  369. ):
  370. yield self._stack[0]
  371. # copied from nltk.parser
  372. def set_grammar(self, grammar):
  373. """
  374. Change the grammar used to parse texts.
  375. :param grammar: The new grammar.
  376. :type grammar: CFG
  377. """
  378. self._grammar = grammar
  379. ##//////////////////////////////////////////////////////
  380. ## Demonstration Code
  381. ##//////////////////////////////////////////////////////
  382. def demo():
  383. """
  384. A demonstration of the shift-reduce parser.
  385. """
  386. from nltk import parse, CFG
  387. grammar = CFG.fromstring(
  388. """
  389. S -> NP VP
  390. NP -> Det N | Det N PP
  391. VP -> V NP | V NP PP
  392. PP -> P NP
  393. NP -> 'I'
  394. N -> 'man' | 'park' | 'telescope' | 'dog'
  395. Det -> 'the' | 'a'
  396. P -> 'in' | 'with'
  397. V -> 'saw'
  398. """
  399. )
  400. sent = 'I saw a man in the park'.split()
  401. parser = parse.ShiftReduceParser(grammar, trace=2)
  402. for p in parser.parse(sent):
  403. print(p)
  404. if __name__ == '__main__':
  405. demo()