123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691 |
- # Natural Language Toolkit: Recursive Descent Parser
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # Steven Bird <stevenbird1@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import print_function, unicode_literals
- from nltk.grammar import Nonterminal
- from nltk.tree import Tree, ImmutableTree
- from nltk.compat import unicode_repr
- from nltk.parse.api import ParserI
- ##//////////////////////////////////////////////////////
- ## Recursive Descent Parser
- ##//////////////////////////////////////////////////////
- class RecursiveDescentParser(ParserI):
- """
- A simple top-down CFG parser that parses texts by recursively
- expanding the fringe of a Tree, and matching it against a
- text.
- ``RecursiveDescentParser`` uses a list of tree locations called a
- "frontier" to remember which subtrees have not yet been expanded
- and which leaves have not yet been matched against the text. Each
- tree location consists of a list of child indices specifying the
- path from the root of the tree to a subtree or a leaf; see the
- reference documentation for Tree for more information
- about tree locations.
- When the parser begins parsing a text, it constructs a tree
- containing only the start symbol, and a frontier containing the
- location of the tree's root node. It then extends the tree to
- cover the text, using the following recursive procedure:
- - If the frontier is empty, and the text is covered by the tree,
- then return the tree as a possible parse.
- - If the frontier is empty, and the text is not covered by the
- tree, then return no parses.
- - If the first element of the frontier is a subtree, then
- use CFG productions to "expand" it. For each applicable
- production, add the expanded subtree's children to the
- frontier, and recursively find all parses that can be
- generated by the new tree and frontier.
- - If the first element of the frontier is a token, then "match"
- it against the next token from the text. Remove the token
- from the frontier, and recursively find all parses that can be
- generated by the new tree and frontier.
- :see: ``nltk.grammar``
- """
- def __init__(self, grammar, trace=0):
- """
- Create a new ``RecursiveDescentParser``, that uses ``grammar``
- to parse texts.
- :type grammar: CFG
- :param grammar: The grammar used to parse texts.
- :type trace: int
- :param trace: The level of tracing that should be used when
- parsing a text. ``0`` will generate no tracing output;
- and higher numbers will produce more verbose tracing
- output.
- """
- self._grammar = grammar
- self._trace = trace
- def grammar(self):
- return self._grammar
- def parse(self, tokens):
- # Inherit docs from ParserI
- tokens = list(tokens)
- self._grammar.check_coverage(tokens)
- # Start a recursive descent parse, with an initial tree
- # containing just the start symbol.
- start = self._grammar.start().symbol()
- initial_tree = Tree(start, [])
- frontier = [()]
- if self._trace:
- self._trace_start(initial_tree, frontier, tokens)
- return self._parse(tokens, initial_tree, frontier)
- def _parse(self, remaining_text, tree, frontier):
- """
- Recursively expand and match each elements of ``tree``
- specified by ``frontier``, to cover ``remaining_text``. Return
- a list of all parses found.
- :return: An iterator of all parses that can be generated by
- matching and expanding the elements of ``tree``
- specified by ``frontier``.
- :rtype: iter(Tree)
- :type tree: Tree
- :param tree: A partial structure for the text that is
- currently being parsed. The elements of ``tree``
- that are specified by ``frontier`` have not yet been
- expanded or matched.
- :type remaining_text: list(str)
- :param remaining_text: The portion of the text that is not yet
- covered by ``tree``.
- :type frontier: list(tuple(int))
- :param frontier: A list of the locations within ``tree`` of
- all subtrees that have not yet been expanded, and all
- leaves that have not yet been matched. This list sorted
- in left-to-right order of location within the tree.
- """
- # If the tree covers the text, and there's nothing left to
- # expand, then we've found a complete parse; return it.
- if len(remaining_text) == 0 and len(frontier) == 0:
- if self._trace:
- self._trace_succeed(tree, frontier)
- yield tree
- # If there's still text, but nothing left to expand, we failed.
- elif len(frontier) == 0:
- if self._trace:
- self._trace_backtrack(tree, frontier)
- # If the next element on the frontier is a tree, expand it.
- elif isinstance(tree[frontier[0]], Tree):
- for result in self._expand(remaining_text, tree, frontier):
- yield result
- # If the next element on the frontier is a token, match it.
- else:
- for result in self._match(remaining_text, tree, frontier):
- yield result
- def _match(self, rtext, tree, frontier):
- """
- :rtype: iter(Tree)
- :return: an iterator of all parses that can be generated by
- matching the first element of ``frontier`` against the
- first token in ``rtext``. In particular, if the first
- element of ``frontier`` has the same type as the first
- token in ``rtext``, then substitute the token into
- ``tree``; and return all parses that can be generated by
- matching and expanding the remaining elements of
- ``frontier``. If the first element of ``frontier`` does not
- have the same type as the first token in ``rtext``, then
- return empty list.
- :type tree: Tree
- :param tree: A partial structure for the text that is
- currently being parsed. The elements of ``tree``
- that are specified by ``frontier`` have not yet been
- expanded or matched.
- :type rtext: list(str)
- :param rtext: The portion of the text that is not yet
- covered by ``tree``.
- :type frontier: list of tuple of int
- :param frontier: A list of the locations within ``tree`` of
- all subtrees that have not yet been expanded, and all
- leaves that have not yet been matched.
- """
- tree_leaf = tree[frontier[0]]
- if len(rtext) > 0 and tree_leaf == rtext[0]:
- # If it's a terminal that matches rtext[0], then substitute
- # in the token, and continue parsing.
- newtree = tree.copy(deep=True)
- newtree[frontier[0]] = rtext[0]
- if self._trace:
- self._trace_match(newtree, frontier[1:], rtext[0])
- for result in self._parse(rtext[1:], newtree, frontier[1:]):
- yield result
- else:
- # If it's a non-matching terminal, fail.
- if self._trace:
- self._trace_backtrack(tree, frontier, rtext[:1])
- def _expand(self, remaining_text, tree, frontier, production=None):
- """
- :rtype: iter(Tree)
- :return: An iterator of all parses that can be generated by
- expanding the first element of ``frontier`` with
- ``production``. In particular, if the first element of
- ``frontier`` is a subtree whose node type is equal to
- ``production``'s left hand side, then add a child to that
- subtree for each element of ``production``'s right hand
- side; and return all parses that can be generated by
- matching and expanding the remaining elements of
- ``frontier``. If the first element of ``frontier`` is not a
- subtree whose node type is equal to ``production``'s left
- hand side, then return an empty list. If ``production`` is
- not specified, then return a list of all parses that can
- be generated by expanding the first element of ``frontier``
- with *any* CFG production.
- :type tree: Tree
- :param tree: A partial structure for the text that is
- currently being parsed. The elements of ``tree``
- that are specified by ``frontier`` have not yet been
- expanded or matched.
- :type remaining_text: list(str)
- :param remaining_text: The portion of the text that is not yet
- covered by ``tree``.
- :type frontier: list(tuple(int))
- :param frontier: A list of the locations within ``tree`` of
- all subtrees that have not yet been expanded, and all
- leaves that have not yet been matched.
- """
- if production is None:
- productions = self._grammar.productions()
- else:
- productions = [production]
- for production in productions:
- lhs = production.lhs().symbol()
- if lhs == tree[frontier[0]].label():
- subtree = self._production_to_tree(production)
- if frontier[0] == ():
- newtree = subtree
- else:
- newtree = tree.copy(deep=True)
- newtree[frontier[0]] = subtree
- new_frontier = [
- frontier[0] + (i,) for i in range(len(production.rhs()))
- ]
- if self._trace:
- self._trace_expand(newtree, new_frontier, production)
- for result in self._parse(
- remaining_text, newtree, new_frontier + frontier[1:]
- ):
- yield result
- def _production_to_tree(self, production):
- """
- :rtype: Tree
- :return: The Tree that is licensed by ``production``.
- In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
- return a tree that has a node ``lhs.symbol``, and
- ``n`` children. For each nonterminal element
- ``elt[i]`` in the production, the tree token has a
- childless subtree with node value ``elt[i].symbol``; and
- for each terminal element ``elt[j]``, the tree token has
- a leaf token with type ``elt[j]``.
- :param production: The CFG production that licenses the tree
- token that should be returned.
- :type production: Production
- """
- children = []
- for elt in production.rhs():
- if isinstance(elt, Nonterminal):
- children.append(Tree(elt.symbol(), []))
- else:
- # This will be matched.
- children.append(elt)
- return Tree(production.lhs().symbol(), children)
- def trace(self, trace=2):
- """
- Set the level of tracing output that should be generated when
- parsing a text.
- :type trace: int
- :param trace: The trace level. A trace level of ``0`` will
- generate no tracing output; and higher trace levels will
- produce more verbose tracing output.
- :rtype: None
- """
- self._trace = trace
- def _trace_fringe(self, tree, treeloc=None):
- """
- Print trace output displaying the fringe of ``tree``. The
- fringe of ``tree`` consists of all of its leaves and all of
- its childless subtrees.
- :rtype: None
- """
- if treeloc == ():
- print("*", end=' ')
- if isinstance(tree, Tree):
- if len(tree) == 0:
- print(unicode_repr(Nonterminal(tree.label())), end=' ')
- for i in range(len(tree)):
- if treeloc is not None and i == treeloc[0]:
- self._trace_fringe(tree[i], treeloc[1:])
- else:
- self._trace_fringe(tree[i])
- else:
- print(unicode_repr(tree), end=' ')
- def _trace_tree(self, tree, frontier, operation):
- """
- Print trace output displaying the parser's current state.
- :param operation: A character identifying the operation that
- generated the current state.
- :rtype: None
- """
- if self._trace == 2:
- print(' %c [' % operation, end=' ')
- else:
- print(' [', end=' ')
- if len(frontier) > 0:
- self._trace_fringe(tree, frontier[0])
- else:
- self._trace_fringe(tree)
- print(']')
- def _trace_start(self, tree, frontier, text):
- print('Parsing %r' % " ".join(text))
- if self._trace > 2:
- print('Start:')
- if self._trace > 1:
- self._trace_tree(tree, frontier, ' ')
- def _trace_expand(self, tree, frontier, production):
- if self._trace > 2:
- print('Expand: %s' % production)
- if self._trace > 1:
- self._trace_tree(tree, frontier, 'E')
- def _trace_match(self, tree, frontier, tok):
- if self._trace > 2:
- print('Match: %r' % tok)
- if self._trace > 1:
- self._trace_tree(tree, frontier, 'M')
- def _trace_succeed(self, tree, frontier):
- if self._trace > 2:
- print('GOOD PARSE:')
- if self._trace == 1:
- print('Found a parse:\n%s' % tree)
- if self._trace > 1:
- self._trace_tree(tree, frontier, '+')
- def _trace_backtrack(self, tree, frontier, toks=None):
- if self._trace > 2:
- if toks:
- print('Backtrack: %r match failed' % toks[0])
- else:
- print('Backtrack')
- ##//////////////////////////////////////////////////////
- ## Stepping Recursive Descent Parser
- ##//////////////////////////////////////////////////////
- class SteppingRecursiveDescentParser(RecursiveDescentParser):
- """
- A ``RecursiveDescentParser`` that allows you to step through the
- parsing process, performing a single operation at a time.
- The ``initialize`` method is used to start parsing a text.
- ``expand`` expands the first element on the frontier using a single
- CFG production, and ``match`` matches the first element on the
- frontier against the next text token. ``backtrack`` undoes the most
- recent expand or match operation. ``step`` performs a single
- expand, match, or backtrack operation. ``parses`` returns the set
- of parses that have been found by the parser.
- :ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
- containing the previous states of the parser. This history is
- used to implement the ``backtrack`` operation.
- :ivar _tried_e: A record of all productions that have been tried
- for a given tree. This record is used by ``expand`` to perform
- the next untried production.
- :ivar _tried_m: A record of what tokens have been matched for a
- given tree. This record is used by ``step`` to decide whether
- or not to match a token.
- :see: ``nltk.grammar``
- """
- def __init__(self, grammar, trace=0):
- super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
- self._rtext = None
- self._tree = None
- self._frontier = [()]
- self._tried_e = {}
- self._tried_m = {}
- self._history = []
- self._parses = []
- # [XX] TEMPORARY HACK WARNING! This should be replaced with
- # something nicer when we get the chance.
- def _freeze(self, tree):
- c = tree.copy()
- # for pos in c.treepositions('leaves'):
- # c[pos] = c[pos].freeze()
- return ImmutableTree.convert(c)
- def parse(self, tokens):
- tokens = list(tokens)
- self.initialize(tokens)
- while self.step() is not None:
- pass
- return self.parses()
- def initialize(self, tokens):
- """
- Start parsing a given text. This sets the parser's tree to
- the start symbol, its frontier to the root node, and its
- remaining text to ``token['SUBTOKENS']``.
- """
- self._rtext = tokens
- start = self._grammar.start().symbol()
- self._tree = Tree(start, [])
- self._frontier = [()]
- self._tried_e = {}
- self._tried_m = {}
- self._history = []
- self._parses = []
- if self._trace:
- self._trace_start(self._tree, self._frontier, self._rtext)
- def remaining_text(self):
- """
- :return: The portion of the text that is not yet covered by the
- tree.
- :rtype: list(str)
- """
- return self._rtext
- def frontier(self):
- """
- :return: A list of the tree locations of all subtrees that
- have not yet been expanded, and all leaves that have not
- yet been matched.
- :rtype: list(tuple(int))
- """
- return self._frontier
- def tree(self):
- """
- :return: A partial structure for the text that is
- currently being parsed. The elements specified by the
- frontier have not yet been expanded or matched.
- :rtype: Tree
- """
- return self._tree
- def step(self):
- """
- Perform a single parsing operation. If an untried match is
- possible, then perform the match, and return the matched
- token. If an untried expansion is possible, then perform the
- expansion, and return the production that it is based on. If
- backtracking is possible, then backtrack, and return True.
- Otherwise, return None.
- :return: None if no operation was performed; a token if a match
- was performed; a production if an expansion was performed;
- and True if a backtrack operation was performed.
- :rtype: Production or String or bool
- """
- # Try matching (if we haven't already)
- if self.untried_match():
- token = self.match()
- if token is not None:
- return token
- # Try expanding.
- production = self.expand()
- if production is not None:
- return production
- # Try backtracking
- if self.backtrack():
- self._trace_backtrack(self._tree, self._frontier)
- return True
- # Nothing left to do.
- return None
- def expand(self, production=None):
- """
- Expand the first element of the frontier. In particular, if
- the first element of the frontier is a subtree whose node type
- is equal to ``production``'s left hand side, then add a child
- to that subtree for each element of ``production``'s right hand
- side. If ``production`` is not specified, then use the first
- untried expandable production. If all expandable productions
- have been tried, do nothing.
- :return: The production used to expand the frontier, if an
- expansion was performed. If no expansion was performed,
- return None.
- :rtype: Production or None
- """
- # Make sure we *can* expand.
- if len(self._frontier) == 0:
- return None
- if not isinstance(self._tree[self._frontier[0]], Tree):
- return None
- # If they didn't specify a production, check all untried ones.
- if production is None:
- productions = self.untried_expandable_productions()
- else:
- productions = [production]
- parses = []
- for prod in productions:
- # Record that we've tried this production now.
- self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
- # Try expanding.
- for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
- return prod
- # We didn't expand anything.
- return None
- def match(self):
- """
- Match the first element of the frontier. In particular, if
- the first element of the frontier has the same type as the
- next text token, then substitute the text token into the tree.
- :return: The token matched, if a match operation was
- performed. If no match was performed, return None
- :rtype: str or None
- """
- # Record that we've tried matching this token.
- tok = self._rtext[0]
- self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
- # Make sure we *can* match.
- if len(self._frontier) == 0:
- return None
- if isinstance(self._tree[self._frontier[0]], Tree):
- return None
- for _result in self._match(self._rtext, self._tree, self._frontier):
- # Return the token we just matched.
- return self._history[-1][0][0]
- return None
- def backtrack(self):
- """
- Return the parser to its state before the most recent
- match or expand operation. Calling ``undo`` repeatedly return
- the parser to successively earlier states. If no match or
- expand operations have been performed, ``undo`` will make no
- changes.
- :return: true if an operation was successfully undone.
- :rtype: bool
- """
- if len(self._history) == 0:
- return False
- (self._rtext, self._tree, self._frontier) = self._history.pop()
- return True
- def expandable_productions(self):
- """
- :return: A list of all the productions for which expansions
- are available for the current parser state.
- :rtype: list(Production)
- """
- # Make sure we *can* expand.
- if len(self._frontier) == 0:
- return []
- frontier_child = self._tree[self._frontier[0]]
- if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
- return []
- return [
- p
- for p in self._grammar.productions()
- if p.lhs().symbol() == frontier_child.label()
- ]
- def untried_expandable_productions(self):
- """
- :return: A list of all the untried productions for which
- expansions are available for the current parser state.
- :rtype: list(Production)
- """
- tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
- return [p for p in self.expandable_productions() if p not in tried_expansions]
- def untried_match(self):
- """
- :return: Whether the first element of the frontier is a token
- that has not yet been matched.
- :rtype: bool
- """
- if len(self._rtext) == 0:
- return False
- tried_matches = self._tried_m.get(self._freeze(self._tree), [])
- return self._rtext[0] not in tried_matches
- def currently_complete(self):
- """
- :return: Whether the parser's current state represents a
- complete parse.
- :rtype: bool
- """
- return len(self._frontier) == 0 and len(self._rtext) == 0
- def _parse(self, remaining_text, tree, frontier):
- """
- A stub version of ``_parse`` that sets the parsers current
- state to the given arguments. In ``RecursiveDescentParser``,
- the ``_parse`` method is used to recursively continue parsing a
- text. ``SteppingRecursiveDescentParser`` overrides it to
- capture these recursive calls. It records the parser's old
- state in the history (to allow for backtracking), and updates
- the parser's new state using the given arguments. Finally, it
- returns ``[1]``, which is used by ``match`` and ``expand`` to
- detect whether their operations were successful.
- :return: ``[1]``
- :rtype: list of int
- """
- self._history.append((self._rtext, self._tree, self._frontier))
- self._rtext = remaining_text
- self._tree = tree
- self._frontier = frontier
- # Is it a good parse? If so, record it.
- if len(frontier) == 0 and len(remaining_text) == 0:
- self._parses.append(tree)
- self._trace_succeed(self._tree, self._frontier)
- return [1]
- def parses(self):
- """
- :return: An iterator of the parses that have been found by this
- parser so far.
- :rtype: list of Tree
- """
- return iter(self._parses)
- def set_grammar(self, grammar):
- """
- Change the grammar used to parse texts.
- :param grammar: The new grammar.
- :type grammar: CFG
- """
- self._grammar = grammar
- ##//////////////////////////////////////////////////////
- ## Demonstration Code
- ##//////////////////////////////////////////////////////
- def demo():
- """
- A demonstration of the recursive descent parser.
- """
- from nltk import parse, CFG
- grammar = CFG.fromstring(
- """
- S -> NP VP
- NP -> Det N | Det N PP
- VP -> V NP | V NP PP
- PP -> P NP
- NP -> 'I'
- N -> 'man' | 'park' | 'telescope' | 'dog'
- Det -> 'the' | 'a'
- P -> 'in' | 'with'
- V -> 'saw'
- """
- )
- for prod in grammar.productions():
- print(prod)
- sent = 'I saw a man in the park'.split()
- parser = parse.RecursiveDescentParser(grammar, trace=2)
- for p in parser.parse(sent):
- print(p)
- if __name__ == '__main__':
- demo()
|