recursivedescent.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691
  1. # Natural Language Toolkit: Recursive Descent Parser
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from __future__ import print_function, unicode_literals
  9. from nltk.grammar import Nonterminal
  10. from nltk.tree import Tree, ImmutableTree
  11. from nltk.compat import unicode_repr
  12. from nltk.parse.api import ParserI
  13. ##//////////////////////////////////////////////////////
  14. ## Recursive Descent Parser
  15. ##//////////////////////////////////////////////////////
  16. class RecursiveDescentParser(ParserI):
  17. """
  18. A simple top-down CFG parser that parses texts by recursively
  19. expanding the fringe of a Tree, and matching it against a
  20. text.
  21. ``RecursiveDescentParser`` uses a list of tree locations called a
  22. "frontier" to remember which subtrees have not yet been expanded
  23. and which leaves have not yet been matched against the text. Each
  24. tree location consists of a list of child indices specifying the
  25. path from the root of the tree to a subtree or a leaf; see the
  26. reference documentation for Tree for more information
  27. about tree locations.
  28. When the parser begins parsing a text, it constructs a tree
  29. containing only the start symbol, and a frontier containing the
  30. location of the tree's root node. It then extends the tree to
  31. cover the text, using the following recursive procedure:
  32. - If the frontier is empty, and the text is covered by the tree,
  33. then return the tree as a possible parse.
  34. - If the frontier is empty, and the text is not covered by the
  35. tree, then return no parses.
  36. - If the first element of the frontier is a subtree, then
  37. use CFG productions to "expand" it. For each applicable
  38. production, add the expanded subtree's children to the
  39. frontier, and recursively find all parses that can be
  40. generated by the new tree and frontier.
  41. - If the first element of the frontier is a token, then "match"
  42. it against the next token from the text. Remove the token
  43. from the frontier, and recursively find all parses that can be
  44. generated by the new tree and frontier.
  45. :see: ``nltk.grammar``
  46. """
  47. def __init__(self, grammar, trace=0):
  48. """
  49. Create a new ``RecursiveDescentParser``, that uses ``grammar``
  50. to parse texts.
  51. :type grammar: CFG
  52. :param grammar: The grammar used to parse texts.
  53. :type trace: int
  54. :param trace: The level of tracing that should be used when
  55. parsing a text. ``0`` will generate no tracing output;
  56. and higher numbers will produce more verbose tracing
  57. output.
  58. """
  59. self._grammar = grammar
  60. self._trace = trace
  61. def grammar(self):
  62. return self._grammar
  63. def parse(self, tokens):
  64. # Inherit docs from ParserI
  65. tokens = list(tokens)
  66. self._grammar.check_coverage(tokens)
  67. # Start a recursive descent parse, with an initial tree
  68. # containing just the start symbol.
  69. start = self._grammar.start().symbol()
  70. initial_tree = Tree(start, [])
  71. frontier = [()]
  72. if self._trace:
  73. self._trace_start(initial_tree, frontier, tokens)
  74. return self._parse(tokens, initial_tree, frontier)
  75. def _parse(self, remaining_text, tree, frontier):
  76. """
  77. Recursively expand and match each elements of ``tree``
  78. specified by ``frontier``, to cover ``remaining_text``. Return
  79. a list of all parses found.
  80. :return: An iterator of all parses that can be generated by
  81. matching and expanding the elements of ``tree``
  82. specified by ``frontier``.
  83. :rtype: iter(Tree)
  84. :type tree: Tree
  85. :param tree: A partial structure for the text that is
  86. currently being parsed. The elements of ``tree``
  87. that are specified by ``frontier`` have not yet been
  88. expanded or matched.
  89. :type remaining_text: list(str)
  90. :param remaining_text: The portion of the text that is not yet
  91. covered by ``tree``.
  92. :type frontier: list(tuple(int))
  93. :param frontier: A list of the locations within ``tree`` of
  94. all subtrees that have not yet been expanded, and all
  95. leaves that have not yet been matched. This list sorted
  96. in left-to-right order of location within the tree.
  97. """
  98. # If the tree covers the text, and there's nothing left to
  99. # expand, then we've found a complete parse; return it.
  100. if len(remaining_text) == 0 and len(frontier) == 0:
  101. if self._trace:
  102. self._trace_succeed(tree, frontier)
  103. yield tree
  104. # If there's still text, but nothing left to expand, we failed.
  105. elif len(frontier) == 0:
  106. if self._trace:
  107. self._trace_backtrack(tree, frontier)
  108. # If the next element on the frontier is a tree, expand it.
  109. elif isinstance(tree[frontier[0]], Tree):
  110. for result in self._expand(remaining_text, tree, frontier):
  111. yield result
  112. # If the next element on the frontier is a token, match it.
  113. else:
  114. for result in self._match(remaining_text, tree, frontier):
  115. yield result
  116. def _match(self, rtext, tree, frontier):
  117. """
  118. :rtype: iter(Tree)
  119. :return: an iterator of all parses that can be generated by
  120. matching the first element of ``frontier`` against the
  121. first token in ``rtext``. In particular, if the first
  122. element of ``frontier`` has the same type as the first
  123. token in ``rtext``, then substitute the token into
  124. ``tree``; and return all parses that can be generated by
  125. matching and expanding the remaining elements of
  126. ``frontier``. If the first element of ``frontier`` does not
  127. have the same type as the first token in ``rtext``, then
  128. return empty list.
  129. :type tree: Tree
  130. :param tree: A partial structure for the text that is
  131. currently being parsed. The elements of ``tree``
  132. that are specified by ``frontier`` have not yet been
  133. expanded or matched.
  134. :type rtext: list(str)
  135. :param rtext: The portion of the text that is not yet
  136. covered by ``tree``.
  137. :type frontier: list of tuple of int
  138. :param frontier: A list of the locations within ``tree`` of
  139. all subtrees that have not yet been expanded, and all
  140. leaves that have not yet been matched.
  141. """
  142. tree_leaf = tree[frontier[0]]
  143. if len(rtext) > 0 and tree_leaf == rtext[0]:
  144. # If it's a terminal that matches rtext[0], then substitute
  145. # in the token, and continue parsing.
  146. newtree = tree.copy(deep=True)
  147. newtree[frontier[0]] = rtext[0]
  148. if self._trace:
  149. self._trace_match(newtree, frontier[1:], rtext[0])
  150. for result in self._parse(rtext[1:], newtree, frontier[1:]):
  151. yield result
  152. else:
  153. # If it's a non-matching terminal, fail.
  154. if self._trace:
  155. self._trace_backtrack(tree, frontier, rtext[:1])
  156. def _expand(self, remaining_text, tree, frontier, production=None):
  157. """
  158. :rtype: iter(Tree)
  159. :return: An iterator of all parses that can be generated by
  160. expanding the first element of ``frontier`` with
  161. ``production``. In particular, if the first element of
  162. ``frontier`` is a subtree whose node type is equal to
  163. ``production``'s left hand side, then add a child to that
  164. subtree for each element of ``production``'s right hand
  165. side; and return all parses that can be generated by
  166. matching and expanding the remaining elements of
  167. ``frontier``. If the first element of ``frontier`` is not a
  168. subtree whose node type is equal to ``production``'s left
  169. hand side, then return an empty list. If ``production`` is
  170. not specified, then return a list of all parses that can
  171. be generated by expanding the first element of ``frontier``
  172. with *any* CFG production.
  173. :type tree: Tree
  174. :param tree: A partial structure for the text that is
  175. currently being parsed. The elements of ``tree``
  176. that are specified by ``frontier`` have not yet been
  177. expanded or matched.
  178. :type remaining_text: list(str)
  179. :param remaining_text: The portion of the text that is not yet
  180. covered by ``tree``.
  181. :type frontier: list(tuple(int))
  182. :param frontier: A list of the locations within ``tree`` of
  183. all subtrees that have not yet been expanded, and all
  184. leaves that have not yet been matched.
  185. """
  186. if production is None:
  187. productions = self._grammar.productions()
  188. else:
  189. productions = [production]
  190. for production in productions:
  191. lhs = production.lhs().symbol()
  192. if lhs == tree[frontier[0]].label():
  193. subtree = self._production_to_tree(production)
  194. if frontier[0] == ():
  195. newtree = subtree
  196. else:
  197. newtree = tree.copy(deep=True)
  198. newtree[frontier[0]] = subtree
  199. new_frontier = [
  200. frontier[0] + (i,) for i in range(len(production.rhs()))
  201. ]
  202. if self._trace:
  203. self._trace_expand(newtree, new_frontier, production)
  204. for result in self._parse(
  205. remaining_text, newtree, new_frontier + frontier[1:]
  206. ):
  207. yield result
  208. def _production_to_tree(self, production):
  209. """
  210. :rtype: Tree
  211. :return: The Tree that is licensed by ``production``.
  212. In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
  213. return a tree that has a node ``lhs.symbol``, and
  214. ``n`` children. For each nonterminal element
  215. ``elt[i]`` in the production, the tree token has a
  216. childless subtree with node value ``elt[i].symbol``; and
  217. for each terminal element ``elt[j]``, the tree token has
  218. a leaf token with type ``elt[j]``.
  219. :param production: The CFG production that licenses the tree
  220. token that should be returned.
  221. :type production: Production
  222. """
  223. children = []
  224. for elt in production.rhs():
  225. if isinstance(elt, Nonterminal):
  226. children.append(Tree(elt.symbol(), []))
  227. else:
  228. # This will be matched.
  229. children.append(elt)
  230. return Tree(production.lhs().symbol(), children)
  231. def trace(self, trace=2):
  232. """
  233. Set the level of tracing output that should be generated when
  234. parsing a text.
  235. :type trace: int
  236. :param trace: The trace level. A trace level of ``0`` will
  237. generate no tracing output; and higher trace levels will
  238. produce more verbose tracing output.
  239. :rtype: None
  240. """
  241. self._trace = trace
  242. def _trace_fringe(self, tree, treeloc=None):
  243. """
  244. Print trace output displaying the fringe of ``tree``. The
  245. fringe of ``tree`` consists of all of its leaves and all of
  246. its childless subtrees.
  247. :rtype: None
  248. """
  249. if treeloc == ():
  250. print("*", end=' ')
  251. if isinstance(tree, Tree):
  252. if len(tree) == 0:
  253. print(unicode_repr(Nonterminal(tree.label())), end=' ')
  254. for i in range(len(tree)):
  255. if treeloc is not None and i == treeloc[0]:
  256. self._trace_fringe(tree[i], treeloc[1:])
  257. else:
  258. self._trace_fringe(tree[i])
  259. else:
  260. print(unicode_repr(tree), end=' ')
  261. def _trace_tree(self, tree, frontier, operation):
  262. """
  263. Print trace output displaying the parser's current state.
  264. :param operation: A character identifying the operation that
  265. generated the current state.
  266. :rtype: None
  267. """
  268. if self._trace == 2:
  269. print(' %c [' % operation, end=' ')
  270. else:
  271. print(' [', end=' ')
  272. if len(frontier) > 0:
  273. self._trace_fringe(tree, frontier[0])
  274. else:
  275. self._trace_fringe(tree)
  276. print(']')
  277. def _trace_start(self, tree, frontier, text):
  278. print('Parsing %r' % " ".join(text))
  279. if self._trace > 2:
  280. print('Start:')
  281. if self._trace > 1:
  282. self._trace_tree(tree, frontier, ' ')
  283. def _trace_expand(self, tree, frontier, production):
  284. if self._trace > 2:
  285. print('Expand: %s' % production)
  286. if self._trace > 1:
  287. self._trace_tree(tree, frontier, 'E')
  288. def _trace_match(self, tree, frontier, tok):
  289. if self._trace > 2:
  290. print('Match: %r' % tok)
  291. if self._trace > 1:
  292. self._trace_tree(tree, frontier, 'M')
  293. def _trace_succeed(self, tree, frontier):
  294. if self._trace > 2:
  295. print('GOOD PARSE:')
  296. if self._trace == 1:
  297. print('Found a parse:\n%s' % tree)
  298. if self._trace > 1:
  299. self._trace_tree(tree, frontier, '+')
  300. def _trace_backtrack(self, tree, frontier, toks=None):
  301. if self._trace > 2:
  302. if toks:
  303. print('Backtrack: %r match failed' % toks[0])
  304. else:
  305. print('Backtrack')
  306. ##//////////////////////////////////////////////////////
  307. ## Stepping Recursive Descent Parser
  308. ##//////////////////////////////////////////////////////
  309. class SteppingRecursiveDescentParser(RecursiveDescentParser):
  310. """
  311. A ``RecursiveDescentParser`` that allows you to step through the
  312. parsing process, performing a single operation at a time.
  313. The ``initialize`` method is used to start parsing a text.
  314. ``expand`` expands the first element on the frontier using a single
  315. CFG production, and ``match`` matches the first element on the
  316. frontier against the next text token. ``backtrack`` undoes the most
  317. recent expand or match operation. ``step`` performs a single
  318. expand, match, or backtrack operation. ``parses`` returns the set
  319. of parses that have been found by the parser.
  320. :ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
  321. containing the previous states of the parser. This history is
  322. used to implement the ``backtrack`` operation.
  323. :ivar _tried_e: A record of all productions that have been tried
  324. for a given tree. This record is used by ``expand`` to perform
  325. the next untried production.
  326. :ivar _tried_m: A record of what tokens have been matched for a
  327. given tree. This record is used by ``step`` to decide whether
  328. or not to match a token.
  329. :see: ``nltk.grammar``
  330. """
  331. def __init__(self, grammar, trace=0):
  332. super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
  333. self._rtext = None
  334. self._tree = None
  335. self._frontier = [()]
  336. self._tried_e = {}
  337. self._tried_m = {}
  338. self._history = []
  339. self._parses = []
  340. # [XX] TEMPORARY HACK WARNING! This should be replaced with
  341. # something nicer when we get the chance.
  342. def _freeze(self, tree):
  343. c = tree.copy()
  344. # for pos in c.treepositions('leaves'):
  345. # c[pos] = c[pos].freeze()
  346. return ImmutableTree.convert(c)
  347. def parse(self, tokens):
  348. tokens = list(tokens)
  349. self.initialize(tokens)
  350. while self.step() is not None:
  351. pass
  352. return self.parses()
  353. def initialize(self, tokens):
  354. """
  355. Start parsing a given text. This sets the parser's tree to
  356. the start symbol, its frontier to the root node, and its
  357. remaining text to ``token['SUBTOKENS']``.
  358. """
  359. self._rtext = tokens
  360. start = self._grammar.start().symbol()
  361. self._tree = Tree(start, [])
  362. self._frontier = [()]
  363. self._tried_e = {}
  364. self._tried_m = {}
  365. self._history = []
  366. self._parses = []
  367. if self._trace:
  368. self._trace_start(self._tree, self._frontier, self._rtext)
  369. def remaining_text(self):
  370. """
  371. :return: The portion of the text that is not yet covered by the
  372. tree.
  373. :rtype: list(str)
  374. """
  375. return self._rtext
  376. def frontier(self):
  377. """
  378. :return: A list of the tree locations of all subtrees that
  379. have not yet been expanded, and all leaves that have not
  380. yet been matched.
  381. :rtype: list(tuple(int))
  382. """
  383. return self._frontier
  384. def tree(self):
  385. """
  386. :return: A partial structure for the text that is
  387. currently being parsed. The elements specified by the
  388. frontier have not yet been expanded or matched.
  389. :rtype: Tree
  390. """
  391. return self._tree
  392. def step(self):
  393. """
  394. Perform a single parsing operation. If an untried match is
  395. possible, then perform the match, and return the matched
  396. token. If an untried expansion is possible, then perform the
  397. expansion, and return the production that it is based on. If
  398. backtracking is possible, then backtrack, and return True.
  399. Otherwise, return None.
  400. :return: None if no operation was performed; a token if a match
  401. was performed; a production if an expansion was performed;
  402. and True if a backtrack operation was performed.
  403. :rtype: Production or String or bool
  404. """
  405. # Try matching (if we haven't already)
  406. if self.untried_match():
  407. token = self.match()
  408. if token is not None:
  409. return token
  410. # Try expanding.
  411. production = self.expand()
  412. if production is not None:
  413. return production
  414. # Try backtracking
  415. if self.backtrack():
  416. self._trace_backtrack(self._tree, self._frontier)
  417. return True
  418. # Nothing left to do.
  419. return None
  420. def expand(self, production=None):
  421. """
  422. Expand the first element of the frontier. In particular, if
  423. the first element of the frontier is a subtree whose node type
  424. is equal to ``production``'s left hand side, then add a child
  425. to that subtree for each element of ``production``'s right hand
  426. side. If ``production`` is not specified, then use the first
  427. untried expandable production. If all expandable productions
  428. have been tried, do nothing.
  429. :return: The production used to expand the frontier, if an
  430. expansion was performed. If no expansion was performed,
  431. return None.
  432. :rtype: Production or None
  433. """
  434. # Make sure we *can* expand.
  435. if len(self._frontier) == 0:
  436. return None
  437. if not isinstance(self._tree[self._frontier[0]], Tree):
  438. return None
  439. # If they didn't specify a production, check all untried ones.
  440. if production is None:
  441. productions = self.untried_expandable_productions()
  442. else:
  443. productions = [production]
  444. parses = []
  445. for prod in productions:
  446. # Record that we've tried this production now.
  447. self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
  448. # Try expanding.
  449. for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
  450. return prod
  451. # We didn't expand anything.
  452. return None
  453. def match(self):
  454. """
  455. Match the first element of the frontier. In particular, if
  456. the first element of the frontier has the same type as the
  457. next text token, then substitute the text token into the tree.
  458. :return: The token matched, if a match operation was
  459. performed. If no match was performed, return None
  460. :rtype: str or None
  461. """
  462. # Record that we've tried matching this token.
  463. tok = self._rtext[0]
  464. self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
  465. # Make sure we *can* match.
  466. if len(self._frontier) == 0:
  467. return None
  468. if isinstance(self._tree[self._frontier[0]], Tree):
  469. return None
  470. for _result in self._match(self._rtext, self._tree, self._frontier):
  471. # Return the token we just matched.
  472. return self._history[-1][0][0]
  473. return None
  474. def backtrack(self):
  475. """
  476. Return the parser to its state before the most recent
  477. match or expand operation. Calling ``undo`` repeatedly return
  478. the parser to successively earlier states. If no match or
  479. expand operations have been performed, ``undo`` will make no
  480. changes.
  481. :return: true if an operation was successfully undone.
  482. :rtype: bool
  483. """
  484. if len(self._history) == 0:
  485. return False
  486. (self._rtext, self._tree, self._frontier) = self._history.pop()
  487. return True
  488. def expandable_productions(self):
  489. """
  490. :return: A list of all the productions for which expansions
  491. are available for the current parser state.
  492. :rtype: list(Production)
  493. """
  494. # Make sure we *can* expand.
  495. if len(self._frontier) == 0:
  496. return []
  497. frontier_child = self._tree[self._frontier[0]]
  498. if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
  499. return []
  500. return [
  501. p
  502. for p in self._grammar.productions()
  503. if p.lhs().symbol() == frontier_child.label()
  504. ]
  505. def untried_expandable_productions(self):
  506. """
  507. :return: A list of all the untried productions for which
  508. expansions are available for the current parser state.
  509. :rtype: list(Production)
  510. """
  511. tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
  512. return [p for p in self.expandable_productions() if p not in tried_expansions]
  513. def untried_match(self):
  514. """
  515. :return: Whether the first element of the frontier is a token
  516. that has not yet been matched.
  517. :rtype: bool
  518. """
  519. if len(self._rtext) == 0:
  520. return False
  521. tried_matches = self._tried_m.get(self._freeze(self._tree), [])
  522. return self._rtext[0] not in tried_matches
  523. def currently_complete(self):
  524. """
  525. :return: Whether the parser's current state represents a
  526. complete parse.
  527. :rtype: bool
  528. """
  529. return len(self._frontier) == 0 and len(self._rtext) == 0
  530. def _parse(self, remaining_text, tree, frontier):
  531. """
  532. A stub version of ``_parse`` that sets the parsers current
  533. state to the given arguments. In ``RecursiveDescentParser``,
  534. the ``_parse`` method is used to recursively continue parsing a
  535. text. ``SteppingRecursiveDescentParser`` overrides it to
  536. capture these recursive calls. It records the parser's old
  537. state in the history (to allow for backtracking), and updates
  538. the parser's new state using the given arguments. Finally, it
  539. returns ``[1]``, which is used by ``match`` and ``expand`` to
  540. detect whether their operations were successful.
  541. :return: ``[1]``
  542. :rtype: list of int
  543. """
  544. self._history.append((self._rtext, self._tree, self._frontier))
  545. self._rtext = remaining_text
  546. self._tree = tree
  547. self._frontier = frontier
  548. # Is it a good parse? If so, record it.
  549. if len(frontier) == 0 and len(remaining_text) == 0:
  550. self._parses.append(tree)
  551. self._trace_succeed(self._tree, self._frontier)
  552. return [1]
  553. def parses(self):
  554. """
  555. :return: An iterator of the parses that have been found by this
  556. parser so far.
  557. :rtype: list of Tree
  558. """
  559. return iter(self._parses)
  560. def set_grammar(self, grammar):
  561. """
  562. Change the grammar used to parse texts.
  563. :param grammar: The new grammar.
  564. :type grammar: CFG
  565. """
  566. self._grammar = grammar
  567. ##//////////////////////////////////////////////////////
  568. ## Demonstration Code
  569. ##//////////////////////////////////////////////////////
  570. def demo():
  571. """
  572. A demonstration of the recursive descent parser.
  573. """
  574. from nltk import parse, CFG
  575. grammar = CFG.fromstring(
  576. """
  577. S -> NP VP
  578. NP -> Det N | Det N PP
  579. VP -> V NP | V NP PP
  580. PP -> P NP
  581. NP -> 'I'
  582. N -> 'man' | 'park' | 'telescope' | 'dog'
  583. Det -> 'the' | 'a'
  584. P -> 'in' | 'with'
  585. V -> 'saw'
  586. """
  587. )
  588. for prod in grammar.productions():
  589. print(prod)
  590. sent = 'I saw a man in the park'.split()
  591. parser = parse.RecursiveDescentParser(grammar, trace=2)
  592. for p in parser.parse(sent):
  593. print(p)
  594. if __name__ == '__main__':
  595. demo()