# Natural Language Toolkit: Viterbi Probabilistic Parser
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function, unicode_literals

from functools import reduce

from nltk.tree import Tree, ProbabilisticTree
from nltk.compat import python_2_unicode_compatible
from nltk.parse.api import ParserI

##//////////////////////////////////////////////////////
##  Viterbi PCFG Parser
##//////////////////////////////////////////////////////


@python_2_unicode_compatible
class ViterbiParser(ParserI):
- """
- A bottom-up ``PCFG`` parser that uses dynamic programming to find
- the single most likely parse for a text. The ``ViterbiParser`` parser
- parses texts by filling in a "most likely constituent table".
- This table records the most probable tree representation for any
- given span and node value. In particular, it has an entry for
- every start index, end index, and node value, recording the most
- likely subtree that spans from the start index to the end index,
- and has the given node value.
- The ``ViterbiParser`` parser fills in this table incrementally. It starts
- by filling in all entries for constituents that span one element
- of text (i.e., entries where the end index is one greater than the
- start index). After it has filled in all table entries for
- constituents that span one element of text, it fills in the
- entries for constitutants that span two elements of text. It
- continues filling in the entries for constituents spanning larger
- and larger portions of the text, until the entire table has been
- filled. Finally, it returns the table entry for a constituent
- spanning the entire text, whose node value is the grammar's start
- symbol.
- In order to find the most likely constituent with a given span and
- node value, the ``ViterbiParser`` parser considers all productions that
- could produce that node value. For each production, it finds all
- children that collectively cover the span and have the node values
- specified by the production's right hand side. If the probability
- of the tree formed by applying the production to the children is
- greater than the probability of the current entry in the table,
- then the table is updated with this new tree.
- A pseudo-code description of the algorithm used by
- ``ViterbiParser`` is:

    | Create an empty most likely constituent table, *MLC*.
    | For width in 1...len(text):
    |   For start in 0...len(text)-width:
    |     For prod in grammar.productions:
    |       For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC,
    |         where t[i].label()==prod.rhs[i],
    |         and the sequence covers [start:start+width]:
    |           old_p = MLC[start, start+width, prod.lhs]
    |           new_p = P(t[1])P(t[2])...P(t[n])P(prod)
    |           if new_p > old_p:
    |             new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n])
    |             MLC[start, start+width, prod.lhs] = new_tree
    | Return MLC[0, len(text), start_symbol]
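
    A minimal usage sketch (assuming the toy grammar ``toy_pcfg1`` from
    ``nltk.grammar``, whose lexicon covers this sentence; the resulting
    tree and its probability depend entirely on the grammar)::

        from nltk.grammar import toy_pcfg1

        parser = ViterbiParser(toy_pcfg1)
        # parse() yields at most one tree: the single most likely parse.
        for tree in parser.parse('I saw the man with my telescope'.split()):
            print(tree)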

    :type _grammar: PCFG
    :ivar _grammar: The grammar used to parse sentences.
    :type _trace: int
    :ivar _trace: The level of tracing output that should be generated
        when parsing a text.
    """

    def __init__(self, grammar, trace=0):
        """
        Create a new ``ViterbiParser`` parser that uses ``grammar`` to
        parse texts.

        :type grammar: PCFG
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        self._grammar = grammar
        self._trace = trace

    def grammar(self):
        return self._grammar

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level.  A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace

    def parse(self, tokens):
        # Inherit docs from ParserI
        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # The most likely constituent table.  This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens.  For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value.  For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
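        # For example, constituents[0, 2, NP] would be the most likely
        # tree rooted in the nonterminal NP that covers tokens[0:2].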
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text.
        if self._trace:
            print('Inserting tokens into the most likely constituents table...')
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index, index + 1, token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        for length in range(1, len(tokens) + 1):
            if self._trace:
                print(
                    'Finding the most likely constituents'
                    ' spanning %d text elements...' % length
                )
            for start in range(len(tokens) - length + 1):
                span = (start, start + length)
                self._add_constituents_spanning(span, constituents, tokens)

        # Return the tree that spans the entire text and has the
        # grammar's start symbol as its root node value.
        tree = constituents.get((0, len(tokens), self._grammar.start()))
        if tree is not None:
            yield tree

    def _add_constituents_spanning(self, span, constituents, tokens):
        """
        Find any constituents that might cover ``span``, and add them
        to the most likely constituents table.

        :rtype: None
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find possible constituents.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be included in
            the constituent; and the second integer is the index of
            the first token that should not be included in the
            constituent.  I.e., the constituent should cover
            ``text[span[0]:span[1]]``, where ``text`` is the text
            that we are parsing.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  In particular,
            ``constituents(s,e,nv)`` is the most likely
            ``ProbabilisticTree`` that covers ``text[s:e]``
            and has a node value ``nv.symbol()``, where ``text``
            is the text that we are parsing.  When
            ``_add_constituents_spanning`` is called, ``constituents``
            should contain all possible constituents that are shorter
            than ``span``.
        :type tokens: list of tokens
        :param tokens: The text we are parsing.  This is only used for
            trace output.
        """
        # Since some of the grammar productions may be unary, we need to
        # repeatedly try all of the productions until none of them add any
        # new constituents.
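        # (The loop reaches a fixed point: an entry is only ever replaced
        # by a strictly more probable tree, so a pass that improves
        # nothing leaves changed == False and ends the loop.)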
        changed = True
        while changed:
            changed = False

            # Find all ways to instantiate the grammar productions so
            # that they cover the span.
            instantiations = self._find_instantiations(span, constituents)

            # For each production instantiation, add a new
            # ProbabilisticTree whose probability is the product
            # of the children's probabilities and the production's
            # probability.
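            # For example, if NP -> Det N has probability 0.5 and the
            # Det and N subtrees have probabilities 0.8 and 0.25, the
            # new NP tree's probability is 0.5 * 0.8 * 0.25 = 0.1.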
            for (production, children) in instantiations:
                subtrees = [c for c in children if isinstance(c, Tree)]
                p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob())
                node = production.lhs().symbol()
                tree = ProbabilisticTree(node, children, prob=p)

                # If it's a new constituent, then add it to the
                # constituents dictionary.
                c = constituents.get((span[0], span[1], production.lhs()))
                if self._trace > 1:
                    if c is None or c != tree:
                        if c is None or c.prob() < tree.prob():
                            print('   Insert:', end=' ')
                        else:
                            print('  Discard:', end=' ')
                        self._trace_production(production, p, span, len(tokens))
                if c is None or c.prob() < tree.prob():
                    constituents[span[0], span[1], production.lhs()] = tree
                    changed = True

    def _find_instantiations(self, span, constituents):
        """
        :return: a list of the production instantiations that cover a
            given span of the text.  A "production instantiation" is
            a tuple containing a production and a list of children,
            where the production's right hand side matches the list of
            children; and the children cover ``span``.
        :rtype: list(tuple(Production, list(ProbabilisticTree or token)))
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find production instantiations.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be covered by
            the production instantiation; and the second integer is
            the index of the first token that should not be covered by
            the production instantiation.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        rv = []
        for production in self._grammar.productions():
            childlists = self._match_rhs(production.rhs(), span, constituents)
            for childlist in childlists:
                rv.append((production, childlist))
        return rv

    def _match_rhs(self, rhs, span, constituents):
        """
        :return: a set of all the lists of children that cover ``span``
            and that match ``rhs``.
        :rtype: list(list(ProbabilisticTree or token))
        :type rhs: list(Nonterminal or any)
        :param rhs: The list specifying what kinds of children need to
            cover ``span``.  Each nonterminal in ``rhs`` specifies
            that the corresponding child should be a tree whose node
            value is that nonterminal's symbol.  Each terminal in ``rhs``
            specifies that the corresponding child should be a token
            whose type is that terminal.
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find child lists.  The span is specified as a
            pair of integers, where the first integer is the index of
            the first token that should be covered by the child list;
            and the second integer is the index of the first token
            that should not be covered by the child list.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        (start, end) = span

        # Base case: an empty RHS matches an empty span; any other
        # combination of an empty RHS or an empty span fails.
        if start >= end and rhs == ():
            return [[]]
        if start >= end or rhs == ():
            return []

        # Find everything that matches the 1st symbol of the RHS
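        # (e.g., to match rhs == (NP, VP) over span (0, 5), try an NP
        # over (0, split) for each split point, then recursively match
        # a VP over (split, 5))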
        childlists = []
        for split in range(start, end + 1):
            left = constituents.get((start, split, rhs[0]))
            if left is not None:
                rights = self._match_rhs(rhs[1:], (split, end), constituents)
                childlists += [[left] + r for r in rights]
        return childlists

    def _trace_production(self, production, p, span, width):
        """
        Print trace output indicating that a given production has been
        applied at a given location.

        :param production: The production that has been applied
        :type production: Production
        :param p: The probability of the tree produced by the production.
        :type p: float
        :param span: The span of the production
        :type span: tuple
        :param width: The number of tokens in the text being parsed,
            used to scale the trace diagram.
        :type width: int
        :rtype: None
        """
        output = '|' + '.' * span[0]
        output += '=' * (span[1] - span[0])
        output += '.' * (width - span[1]) + '| '
        output += '%s' % production

        if self._trace > 2:
            output = '%-40s %12.10f ' % (output, p)

        print(output)

    def _trace_lexical_insertion(self, token, index, width):
        output = '   Insert: |' + '.' * index + '=' + '.' * (width - index - 1) + '| '
        output += '%s' % (token,)
        print(output)

    def __repr__(self):
        return '<ViterbiParser for %r>' % self._grammar


##//////////////////////////////////////////////////////
##  Test Code
##//////////////////////////////////////////////////////


def demo():
    """
    A demonstration of the Viterbi parser.  The user is prompted to
    select a demo sentence and its grammar; the parser is then run on
    the tokenized sentence with full tracing, and a summary of the
    results is displayed.
    """
    import sys, time

    from nltk.parse import ViterbiParser
    from nltk.grammar import toy_pcfg1, toy_pcfg2

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [
        ('I saw the man with my telescope', toy_pcfg1),
        ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
    ]

    # Ask the user which demo they want to use.
    print()
    for i in range(len(demos)):
        print('%3s: %s' % (i + 1, demos[i][0]))
        print('     %r' % demos[i][1])
        print()
    print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
    try:
        snum = int(sys.stdin.readline().strip()) - 1
        sent, grammar = demos[snum]
    except (ValueError, IndexError):
        print('Bad sentence number')
        return

    # Tokenize the sentence.
    tokens = sent.split()

    parser = ViterbiParser(grammar)
    all_parses = {}

    print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
    parser.trace(3)
    t = time.time()
    parses = parser.parse_all(tokens)
    elapsed = time.time() - t  # don't shadow the time module
    average = (
        reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
    )
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print()
    print('Time (secs)   # Parses   Average P(parse)')
    print('-----------------------------------------')
    print('%11.4f%11d%19.14f' % (elapsed, num_parses, average))
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
    else:
        p = 0
    print('-----------------------------------------')
    print('%11s%11d%19.14f' % ('n/a', len(parses), p))

    # Ask the user if we should draw the parses.
    print()
    print('Draw parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk.draw.tree import draw_trees

        print('  please wait...')
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print()
    print('Print parses (y/n)? ', end=' ')
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print(parse)


if __name__ == '__main__':
    demo()