123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582 |
- # Natural Language Toolkit: Probabilistic Chart Parsers
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # Steven Bird <stevenbird1@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Classes and interfaces for associating probabilities with tree
- structures that represent the internal organization of a text. The
- probabilistic parser module defines ``BottomUpProbabilisticChartParser``.
- ``BottomUpProbabilisticChartParser`` is an abstract class that implements
- a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges,
- and adds them to the chart one at a time. The ordering of this queue
- is based on the probabilities associated with the edges, allowing the
- parser to expand more likely edges before less likely ones. Each
- subclass implements a different queue ordering, producing different
- search strategies. Currently the following subclasses are defined:
- - ``InsideChartParser`` searches edges in decreasing order of
- their trees' inside probabilities.
- - ``RandomChartParser`` searches edges in random order.
- - ``LongestChartParser`` searches edges in decreasing order of their
- location's length.
- The ``BottomUpProbabilisticChartParser`` constructor has an optional
- argument beam_size. If non-zero, this controls the size of the beam
- (aka the edge queue). This option is most useful with InsideChartParser.
- """
- from __future__ import print_function, unicode_literals
- ##//////////////////////////////////////////////////////
- ## Bottom-Up PCFG Chart Parser
- ##//////////////////////////////////////////////////////
- # [XX] This might not be implemented quite right -- it would be better
- # to associate probabilities with child pointer lists.
- import random
- from functools import reduce
- from nltk.tree import Tree, ProbabilisticTree
- from nltk.grammar import Nonterminal, PCFG
- from nltk.parse.api import ParserI
- from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule
- from nltk.compat import python_2_unicode_compatible
- # Probabilistic edges
- class ProbabilisticLeafEdge(LeafEdge):
- def prob(self):
- return 1.0
- class ProbabilisticTreeEdge(TreeEdge):
- def __init__(self, prob, *args, **kwargs):
- TreeEdge.__init__(self, *args, **kwargs)
- self._prob = prob
- # two edges with different probabilities are not equal.
- self._comparison_key = (self._comparison_key, prob)
- def prob(self):
- return self._prob
- @staticmethod
- def from_production(production, index, p):
- return ProbabilisticTreeEdge(
- p, (index, index), production.lhs(), production.rhs(), 0
- )
- # Rules using probabilistic edges
- class ProbabilisticBottomUpInitRule(AbstractChartRule):
- NUM_EDGES = 0
- def apply(self, chart, grammar):
- for index in range(chart.num_leaves()):
- new_edge = ProbabilisticLeafEdge(chart.leaf(index), index)
- if chart.insert(new_edge, ()):
- yield new_edge
- class ProbabilisticBottomUpPredictRule(AbstractChartRule):
- NUM_EDGES = 1
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
- for prod in grammar.productions():
- if edge.lhs() == prod.rhs()[0]:
- new_edge = ProbabilisticTreeEdge.from_production(
- prod, edge.start(), prod.prob()
- )
- if chart.insert(new_edge, ()):
- yield new_edge
- class ProbabilisticFundamentalRule(AbstractChartRule):
- NUM_EDGES = 2
- def apply(self, chart, grammar, left_edge, right_edge):
- # Make sure the rule is applicable.
- if not (
- left_edge.end() == right_edge.start()
- and left_edge.nextsym() == right_edge.lhs()
- and left_edge.is_incomplete()
- and right_edge.is_complete()
- ):
- return
- # Construct the new edge.
- p = left_edge.prob() * right_edge.prob()
- new_edge = ProbabilisticTreeEdge(
- p,
- span=(left_edge.start(), right_edge.end()),
- lhs=left_edge.lhs(),
- rhs=left_edge.rhs(),
- dot=left_edge.dot() + 1,
- )
- # Add it to the chart, with appropriate child pointers.
- changed_chart = False
- for cpl1 in chart.child_pointer_lists(left_edge):
- if chart.insert(new_edge, cpl1 + (right_edge,)):
- changed_chart = True
- # If we changed the chart, then generate the edge.
- if changed_chart:
- yield new_edge
- @python_2_unicode_compatible
- class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule):
- NUM_EDGES = 1
- _fundamental_rule = ProbabilisticFundamentalRule()
- def apply(self, chart, grammar, edge1):
- fr = self._fundamental_rule
- if edge1.is_incomplete():
- # edge1 = left_edge; edge2 = right_edge
- for edge2 in chart.select(
- start=edge1.end(), is_complete=True, lhs=edge1.nextsym()
- ):
- for new_edge in fr.apply(chart, grammar, edge1, edge2):
- yield new_edge
- else:
- # edge2 = left_edge; edge1 = right_edge
- for edge2 in chart.select(
- end=edge1.start(), is_complete=False, nextsym=edge1.lhs()
- ):
- for new_edge in fr.apply(chart, grammar, edge2, edge1):
- yield new_edge
- def __str__(self):
- return 'Fundamental Rule'
- class BottomUpProbabilisticChartParser(ParserI):
- """
- An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to
- record partial results. ``BottomUpProbabilisticChartParser`` maintains
- a queue of edges that can be added to the chart. This queue is
- initialized with edges for each token in the text that is being
- parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into
- the chart one at a time, starting with the most likely edges, and
- proceeding to less likely edges. For each edge that is added to
- the chart, it may become possible to insert additional edges into
- the chart; these are added to the queue. This process continues
- until enough complete parses have been generated, or until the
- queue is empty.
- The sorting order for the queue is not specified by
- ``BottomUpProbabilisticChartParser``. Different sorting orders will
- result in different search strategies. The sorting order for the
- queue is defined by the method ``sort_queue``; subclasses are required
- to provide a definition for this method.
- :type _grammar: PCFG
- :ivar _grammar: The grammar used to parse sentences.
- :type _trace: int
- :ivar _trace: The level of tracing output that should be generated
- when parsing a text.
- """
- def __init__(self, grammar, beam_size=0, trace=0):
- """
- Create a new ``BottomUpProbabilisticChartParser``, that uses
- ``grammar`` to parse texts.
- :type grammar: PCFG
- :param grammar: The grammar used to parse texts.
- :type beam_size: int
- :param beam_size: The maximum length for the parser's edge queue.
- :type trace: int
- :param trace: The level of tracing that should be used when
- parsing a text. ``0`` will generate no tracing output;
- and higher numbers will produce more verbose tracing
- output.
- """
- if not isinstance(grammar, PCFG):
- raise ValueError("The grammar must be probabilistic PCFG")
- self._grammar = grammar
- self.beam_size = beam_size
- self._trace = trace
- def grammar(self):
- return self._grammar
- def trace(self, trace=2):
- """
- Set the level of tracing output that should be generated when
- parsing a text.
- :type trace: int
- :param trace: The trace level. A trace level of ``0`` will
- generate no tracing output; and higher trace levels will
- produce more verbose tracing output.
- :rtype: None
- """
- self._trace = trace
- # TODO: change this to conform more with the standard ChartParser
- def parse(self, tokens):
- self._grammar.check_coverage(tokens)
- chart = Chart(list(tokens))
- grammar = self._grammar
- # Chart parser rules.
- bu_init = ProbabilisticBottomUpInitRule()
- bu = ProbabilisticBottomUpPredictRule()
- fr = SingleEdgeProbabilisticFundamentalRule()
- # Our queue
- queue = []
- # Initialize the chart.
- for edge in bu_init.apply(chart, grammar):
- if self._trace > 1:
- print(
- ' %-50s [%s]'
- % (chart.pretty_format_edge(edge, width=2), edge.prob())
- )
- queue.append(edge)
- while len(queue) > 0:
- # Re-sort the queue.
- self.sort_queue(queue, chart)
- # Prune the queue to the correct size if a beam was defined
- if self.beam_size:
- self._prune(queue, chart)
- # Get the best edge.
- edge = queue.pop()
- if self._trace > 0:
- print(
- ' %-50s [%s]'
- % (chart.pretty_format_edge(edge, width=2), edge.prob())
- )
- # Apply BU & FR to it.
- queue.extend(bu.apply(chart, grammar, edge))
- queue.extend(fr.apply(chart, grammar, edge))
- # Get a list of complete parses.
- parses = list(chart.parses(grammar.start(), ProbabilisticTree))
- # Assign probabilities to the trees.
- prod_probs = {}
- for prod in grammar.productions():
- prod_probs[prod.lhs(), prod.rhs()] = prod.prob()
- for parse in parses:
- self._setprob(parse, prod_probs)
- # Sort by probability
- parses.sort(reverse=True, key=lambda tree: tree.prob())
- return iter(parses)
- def _setprob(self, tree, prod_probs):
- if tree.prob() is not None:
- return
- # Get the prob of the CFG production.
- lhs = Nonterminal(tree.label())
- rhs = []
- for child in tree:
- if isinstance(child, Tree):
- rhs.append(Nonterminal(child.label()))
- else:
- rhs.append(child)
- prob = prod_probs[lhs, tuple(rhs)]
- # Get the probs of children.
- for child in tree:
- if isinstance(child, Tree):
- self._setprob(child, prod_probs)
- prob *= child.prob()
- tree.set_prob(prob)
- def sort_queue(self, queue, chart):
- """
- Sort the given queue of ``Edge`` objects, placing the edge that should
- be tried first at the beginning of the queue. This method
- will be called after each ``Edge`` is added to the queue.
- :param queue: The queue of ``Edge`` objects to sort. Each edge in
- this queue is an edge that could be added to the chart by
- the fundamental rule; but that has not yet been added.
- :type queue: list(Edge)
- :param chart: The chart being used to parse the text. This
- chart can be used to provide extra information for sorting
- the queue.
- :type chart: Chart
- :rtype: None
- """
- raise NotImplementedError()
- def _prune(self, queue, chart):
- """ Discard items in the queue if the queue is longer than the beam."""
- if len(queue) > self.beam_size:
- split = len(queue) - self.beam_size
- if self._trace > 2:
- for edge in queue[:split]:
- print(' %-50s [DISCARDED]' % chart.pretty_format_edge(edge, 2))
- del queue[:split]
- class InsideChartParser(BottomUpProbabilisticChartParser):
- """
- A bottom-up parser for ``PCFG`` grammars that tries edges in descending
- order of the inside probabilities of their trees. The "inside
- probability" of a tree is simply the
- probability of the entire tree, ignoring its context. In
- particular, the inside probability of a tree generated by
- production *p* with children *c[1], c[2], ..., c[n]* is
- *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside
- probability of a token is 1 if it is present in the text, and 0 if
- it is absent.
- This sorting order results in a type of lowest-cost-first search
- strategy.
- """
- # Inherit constructor.
- def sort_queue(self, queue, chart):
- """
- Sort the given queue of edges, in descending order of the
- inside probabilities of the edges' trees.
- :param queue: The queue of ``Edge`` objects to sort. Each edge in
- this queue is an edge that could be added to the chart by
- the fundamental rule; but that has not yet been added.
- :type queue: list(Edge)
- :param chart: The chart being used to parse the text. This
- chart can be used to provide extra information for sorting
- the queue.
- :type chart: Chart
- :rtype: None
- """
- queue.sort(key=lambda edge: edge.prob())
- # Eventually, this will become some sort of inside-outside parser:
- # class InsideOutsideParser(BottomUpProbabilisticChartParser):
- # def __init__(self, grammar, trace=0):
- # # Inherit docs.
- # BottomUpProbabilisticChartParser.__init__(self, grammar, trace)
- #
- # # Find the best path from S to each nonterminal
- # bestp = {}
- # for production in grammar.productions(): bestp[production.lhs()]=0
- # bestp[grammar.start()] = 1.0
- #
- # for i in range(len(grammar.productions())):
- # for production in grammar.productions():
- # lhs = production.lhs()
- # for elt in production.rhs():
- # bestp[elt] = max(bestp[lhs]*production.prob(),
- # bestp.get(elt,0))
- #
- # self._bestp = bestp
- # for (k,v) in self._bestp.items(): print k,v
- #
- # def _sortkey(self, edge):
- # return edge.structure()[PROB] * self._bestp[edge.lhs()]
- #
- # def sort_queue(self, queue, chart):
- # queue.sort(key=self._sortkey)
- class RandomChartParser(BottomUpProbabilisticChartParser):
- """
- A bottom-up parser for ``PCFG`` grammars that tries edges in random order.
- This sorting order results in a random search strategy.
- """
- # Inherit constructor
- def sort_queue(self, queue, chart):
- i = random.randint(0, len(queue) - 1)
- (queue[-1], queue[i]) = (queue[i], queue[-1])
- class UnsortedChartParser(BottomUpProbabilisticChartParser):
- """
- A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order.
- """
- # Inherit constructor
- def sort_queue(self, queue, chart):
- return
- class LongestChartParser(BottomUpProbabilisticChartParser):
- """
- A bottom-up parser for ``PCFG`` grammars that tries longer edges before
- shorter ones. This sorting order results in a type of best-first
- search strategy.
- """
- # Inherit constructor
- def sort_queue(self, queue, chart):
- queue.sort(key=lambda edge: edge.length())
- ##//////////////////////////////////////////////////////
- ## Test Code
- ##//////////////////////////////////////////////////////
- def demo(choice=None, draw_parses=None, print_parses=None):
- """
- A demonstration of the probabilistic parsers. The user is
- prompted to select which demo to run, and how many parses should
- be found; and then each parser is run on the same demo, and a
- summary of the results are displayed.
- """
- import sys, time
- from nltk import tokenize
- from nltk.parse import pchart
- # Define two demos. Each demo has a sentence and a grammar.
- toy_pcfg1 = PCFG.fromstring(
- """
- S -> NP VP [1.0]
- NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
- Det -> 'the' [0.8] | 'my' [0.2]
- N -> 'man' [0.5] | 'telescope' [0.5]
- VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
- V -> 'ate' [0.35] | 'saw' [0.65]
- PP -> P NP [1.0]
- P -> 'with' [0.61] | 'under' [0.39]
- """
- )
- toy_pcfg2 = PCFG.fromstring(
- """
- S -> NP VP [1.0]
- VP -> V NP [.59]
- VP -> V [.40]
- VP -> VP PP [.01]
- NP -> Det N [.41]
- NP -> Name [.28]
- NP -> NP PP [.31]
- PP -> P NP [1.0]
- V -> 'saw' [.21]
- V -> 'ate' [.51]
- V -> 'ran' [.28]
- N -> 'boy' [.11]
- N -> 'cookie' [.12]
- N -> 'table' [.13]
- N -> 'telescope' [.14]
- N -> 'hill' [.5]
- Name -> 'Jack' [.52]
- Name -> 'Bob' [.48]
- P -> 'with' [.61]
- P -> 'under' [.39]
- Det -> 'the' [.41]
- Det -> 'a' [.31]
- Det -> 'my' [.28]
- """
- )
- demos = [
- ('I saw John with my telescope', toy_pcfg1),
- ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2),
- ]
- if choice is None:
- # Ask the user which demo they want to use.
- print()
- for i in range(len(demos)):
- print('%3s: %s' % (i + 1, demos[i][0]))
- print(' %r' % demos[i][1])
- print()
- print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ')
- choice = int(sys.stdin.readline().strip()) - 1
- try:
- sent, grammar = demos[choice]
- except:
- print('Bad sentence number')
- return
- # Tokenize the sentence.
- tokens = sent.split()
- # Define a list of parsers. We'll use all parsers.
- parsers = [
- pchart.InsideChartParser(grammar),
- pchart.RandomChartParser(grammar),
- pchart.UnsortedChartParser(grammar),
- pchart.LongestChartParser(grammar),
- pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser
- ]
- # Run the parsers on the tokenized sentence.
- times = []
- average_p = []
- num_parses = []
- all_parses = {}
- for parser in parsers:
- print('\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar))
- parser.trace(3)
- t = time.time()
- parses = list(parser.parse(tokens))
- times.append(time.time() - t)
- p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0
- average_p.append(p)
- num_parses.append(len(parses))
- for p in parses:
- all_parses[p.freeze()] = 1
- # Print some summary statistics
- print()
- print(' Parser Beam | Time (secs) # Parses Average P(parse)')
- print('------------------------+------------------------------------------')
- for i in range(len(parsers)):
- print(
- '%18s %4d |%11.4f%11d%19.14f'
- % (
- parsers[i].__class__.__name__,
- parsers[i].beam_size,
- times[i],
- num_parses[i],
- average_p[i],
- )
- )
- parses = all_parses.keys()
- if parses:
- p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses)
- else:
- p = 0
- print('------------------------+------------------------------------------')
- print('%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p))
- if draw_parses is None:
- # Ask the user if we should draw the parses.
- print()
- print('Draw parses (y/n)? ', end=' ')
- draw_parses = sys.stdin.readline().strip().lower().startswith('y')
- if draw_parses:
- from nltk.draw.tree import draw_trees
- print(' please wait...')
- draw_trees(*parses)
- if print_parses is None:
- # Ask the user if we should print the parses.
- print()
- print('Print parses (y/n)? ', end=' ')
- print_parses = sys.stdin.readline().strip().lower().startswith('y')
- if print_parses:
- for parse in parses:
- print(parse)
- if __name__ == '__main__':
- demo()
|