123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Chart Parser for Feature-Based Grammars
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Rob Speer <rspeer@mit.edu>
- # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Extension of chart parsing implementation to handle grammars with
- feature structures as nodes.
- """
- from __future__ import print_function, unicode_literals
- from six.moves import range
- from nltk.compat import python_2_unicode_compatible
- from nltk.featstruct import FeatStruct, unify, TYPE, find_variables
- from nltk.sem import logic
- from nltk.tree import Tree
- from nltk.grammar import (
- Nonterminal,
- Production,
- CFG,
- FeatStructNonterminal,
- is_nonterminal,
- is_terminal,
- )
- from nltk.parse.chart import (
- TreeEdge,
- Chart,
- ChartParser,
- EdgeI,
- FundamentalRule,
- LeafInitRule,
- EmptyPredictRule,
- BottomUpPredictRule,
- SingleEdgeFundamentalRule,
- BottomUpPredictCombineRule,
- CachedTopDownPredictRule,
- TopDownInitRule,
- )
- # ////////////////////////////////////////////////////////////
- # Tree Edge
- # ////////////////////////////////////////////////////////////
- @python_2_unicode_compatible
- class FeatureTreeEdge(TreeEdge):
- """
- A specialized tree edge that allows shared variable bindings
- between nonterminals on the left-hand side and right-hand side.
- Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a
- dictionary mapping from variables to values. If the edge is not
- complete, then these bindings are simply stored. However, if the
- edge is complete, then the constructor applies these bindings to
- every nonterminal in the edge whose symbol implements the
- interface ``SubstituteBindingsI``.
- """
- def __init__(self, span, lhs, rhs, dot=0, bindings=None):
- """
- Construct a new edge. If the edge is incomplete (i.e., if
- ``dot<len(rhs)``), then store the bindings as-is. If the edge
- is complete (i.e., if ``dot==len(rhs)``), then apply the
- bindings to all nonterminals in ``lhs`` and ``rhs``, and then
- clear the bindings. See ``TreeEdge`` for a description of
- the other arguments.
- """
- if bindings is None:
- bindings = {}
- # If the edge is complete, then substitute in the bindings,
- # and then throw them away. (If we didn't throw them away, we
- # might think that 2 complete edges are different just because
- # they have different bindings, even though all bindings have
- # already been applied.)
- if dot == len(rhs) and bindings:
- lhs = self._bind(lhs, bindings)
- rhs = [self._bind(elt, bindings) for elt in rhs]
- bindings = {}
- # Initialize the edge.
- TreeEdge.__init__(self, span, lhs, rhs, dot)
- self._bindings = bindings
- self._comparison_key = (self._comparison_key, tuple(sorted(bindings.items())))
- @staticmethod
- def from_production(production, index):
- """
- :return: A new ``TreeEdge`` formed from the given production.
- The new edge's left-hand side and right-hand side will
- be taken from ``production``; its span will be
- ``(index,index)``; and its dot position will be ``0``.
- :rtype: TreeEdge
- """
- return FeatureTreeEdge(
- span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
- )
- def move_dot_forward(self, new_end, bindings=None):
- """
- :return: A new ``FeatureTreeEdge`` formed from this edge.
- The new edge's dot position is increased by ``1``,
- and its end index will be replaced by ``new_end``.
- :rtype: FeatureTreeEdge
- :param new_end: The new end index.
- :type new_end: int
- :param bindings: Bindings for the new edge.
- :type bindings: dict
- """
- return FeatureTreeEdge(
- span=(self._span[0], new_end),
- lhs=self._lhs,
- rhs=self._rhs,
- dot=self._dot + 1,
- bindings=bindings,
- )
- def _bind(self, nt, bindings):
- if not isinstance(nt, FeatStructNonterminal):
- return nt
- return nt.substitute_bindings(bindings)
- def next_with_bindings(self):
- return self._bind(self.nextsym(), self._bindings)
- def bindings(self):
- """
- Return a copy of this edge's bindings dictionary.
- """
- return self._bindings.copy()
- def variables(self):
- """
- :return: The set of variables used by this edge.
- :rtype: set(Variable)
- """
- return find_variables(
- [self._lhs]
- + list(self._rhs)
- + list(self._bindings.keys())
- + list(self._bindings.values()),
- fs_class=FeatStruct,
- )
- def __str__(self):
- if self.is_complete():
- return TreeEdge.__unicode__(self)
- else:
- bindings = '{%s}' % ', '.join(
- '%s: %r' % item for item in sorted(self._bindings.items())
- )
- return '%s %s' % (TreeEdge.__unicode__(self), bindings)
- # ////////////////////////////////////////////////////////////
- # A specialized Chart for feature grammars
- # ////////////////////////////////////////////////////////////
- # TODO: subsumes check when adding new edges
- class FeatureChart(Chart):
- """
- A Chart for feature grammars.
- :see: ``Chart`` for more information.
- """
- def select(self, **restrictions):
- """
- Returns an iterator over the edges in this chart.
- See ``Chart.select`` for more information about the
- ``restrictions`` on the edges.
- """
- # If there are no restrictions, then return all edges.
- if restrictions == {}:
- return iter(self._edges)
- # Find the index corresponding to the given restrictions.
- restr_keys = sorted(restrictions.keys())
- restr_keys = tuple(restr_keys)
- # If it doesn't exist, then create it.
- if restr_keys not in self._indexes:
- self._add_index(restr_keys)
- vals = tuple(
- self._get_type_if_possible(restrictions[key]) for key in restr_keys
- )
- return iter(self._indexes[restr_keys].get(vals, []))
- def _add_index(self, restr_keys):
- """
- A helper function for ``select``, which creates a new index for
- a given set of attributes (aka restriction keys).
- """
- # Make sure it's a valid index.
- for key in restr_keys:
- if not hasattr(EdgeI, key):
- raise ValueError('Bad restriction: %s' % key)
- # Create the index.
- index = self._indexes[restr_keys] = {}
- # Add all existing edges to the index.
- for edge in self._edges:
- vals = tuple(
- self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
- )
- index.setdefault(vals, []).append(edge)
- def _register_with_indexes(self, edge):
- """
- A helper function for ``insert``, which registers the new
- edge with all existing indexes.
- """
- for (restr_keys, index) in self._indexes.items():
- vals = tuple(
- self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
- )
- index.setdefault(vals, []).append(edge)
- def _get_type_if_possible(self, item):
- """
- Helper function which returns the ``TYPE`` feature of the ``item``,
- if it exists, otherwise it returns the ``item`` itself
- """
- if isinstance(item, dict) and TYPE in item:
- return item[TYPE]
- else:
- return item
- def parses(self, start, tree_class=Tree):
- for edge in self.select(start=0, end=self._num_leaves):
- if (
- (isinstance(edge, FeatureTreeEdge))
- and (edge.lhs()[TYPE] == start[TYPE])
- and (unify(edge.lhs(), start, rename_vars=True))
- ):
- for tree in self.trees(edge, complete=True, tree_class=tree_class):
- yield tree
- # ////////////////////////////////////////////////////////////
- # Fundamental Rule
- # ////////////////////////////////////////////////////////////
- class FeatureFundamentalRule(FundamentalRule):
- """
- A specialized version of the fundamental rule that operates on
- nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
- tha simply comparing the nonterminals for equality, they are
- unified. Variable bindings from these unifications are collected
- and stored in the chart using a ``FeatureTreeEdge``. When a
- complete edge is generated, these bindings are applied to all
- nonterminals in the edge.
- The fundamental rule states that:
- - ``[A -> alpha \* B1 beta][i:j]``
- - ``[B2 -> gamma \*][j:k]``
- licenses the edge:
- - ``[A -> alpha B3 \* beta][i:j]``
- assuming that B1 and B2 can be unified to generate B3.
- """
- def apply(self, chart, grammar, left_edge, right_edge):
- # Make sure the rule is applicable.
- if not (
- left_edge.end() == right_edge.start()
- and left_edge.is_incomplete()
- and right_edge.is_complete()
- and isinstance(left_edge, FeatureTreeEdge)
- ):
- return
- found = right_edge.lhs()
- nextsym = left_edge.nextsym()
- if isinstance(right_edge, FeatureTreeEdge):
- if not is_nonterminal(nextsym):
- return
- if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]:
- return
- # Create a copy of the bindings.
- bindings = left_edge.bindings()
- # We rename vars here, because we don't want variables
- # from the two different productions to match.
- found = found.rename_variables(used_vars=left_edge.variables())
- # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to
- # generate B3 (result).
- result = unify(nextsym, found, bindings, rename_vars=False)
- if result is None:
- return
- else:
- if nextsym != found:
- return
- # Create a copy of the bindings.
- bindings = left_edge.bindings()
- # Construct the new edge.
- new_edge = left_edge.move_dot_forward(right_edge.end(), bindings)
- # Add it to the chart, with appropriate child pointers.
- if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
- yield new_edge
- class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
- """
- A specialized version of the completer / single edge fundamental rule
- that operates on nonterminals whose symbols are ``FeatStructNonterminal``s.
- Rather than simply comparing the nonterminals for equality, they are
- unified.
- """
- _fundamental_rule = FeatureFundamentalRule()
- def _apply_complete(self, chart, grammar, right_edge):
- fr = self._fundamental_rule
- for left_edge in chart.select(
- end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
- ):
- for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
- yield new_edge
- def _apply_incomplete(self, chart, grammar, left_edge):
- fr = self._fundamental_rule
- for right_edge in chart.select(
- start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
- ):
- for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
- yield new_edge
- # ////////////////////////////////////////////////////////////
- # Top-Down Prediction
- # ////////////////////////////////////////////////////////////
- class FeatureTopDownInitRule(TopDownInitRule):
- def apply(self, chart, grammar):
- for prod in grammar.productions(lhs=grammar.start()):
- new_edge = FeatureTreeEdge.from_production(prod, 0)
- if chart.insert(new_edge, ()):
- yield new_edge
- class FeatureTopDownPredictRule(CachedTopDownPredictRule):
- """
- A specialized version of the (cached) top down predict rule that operates
- on nonterminals whose symbols are ``FeatStructNonterminal``s. Rather
- than simply comparing the nonterminals for equality, they are
- unified.
- The top down expand rule states that:
- - ``[A -> alpha \* B1 beta][i:j]``
- licenses the edge:
- - ``[B2 -> \* gamma][j:j]``
- for each grammar production ``B2 -> gamma``, assuming that B1
- and B2 can be unified.
- """
- def apply(self, chart, grammar, edge):
- if edge.is_complete():
- return
- nextsym, index = edge.nextsym(), edge.end()
- if not is_nonterminal(nextsym):
- return
- # If we've already applied this rule to an edge with the same
- # next & end, and the chart & grammar have not changed, then
- # just return (no new edges to add).
- nextsym_with_bindings = edge.next_with_bindings()
- done = self._done.get((nextsym_with_bindings, index), (None, None))
- if done[0] is chart and done[1] is grammar:
- return
- for prod in grammar.productions(lhs=nextsym):
- # If the left corner in the predicted production is
- # leaf, it must match with the input.
- if prod.rhs():
- first = prod.rhs()[0]
- if is_terminal(first):
- if index >= chart.num_leaves():
- continue
- if first != chart.leaf(index):
- continue
- # We rename vars here, because we don't want variables
- # from the two different productions to match.
- if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True):
- new_edge = FeatureTreeEdge.from_production(prod, edge.end())
- if chart.insert(new_edge, ()):
- yield new_edge
- # Record the fact that we've applied this rule.
- self._done[nextsym_with_bindings, index] = (chart, grammar)
- # ////////////////////////////////////////////////////////////
- # Bottom-Up Prediction
- # ////////////////////////////////////////////////////////////
- class FeatureBottomUpPredictRule(BottomUpPredictRule):
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
- for prod in grammar.productions(rhs=edge.lhs()):
- if isinstance(edge, FeatureTreeEdge):
- _next = prod.rhs()[0]
- if not is_nonterminal(_next):
- continue
- new_edge = FeatureTreeEdge.from_production(prod, edge.start())
- if chart.insert(new_edge, ()):
- yield new_edge
- class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule):
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
- found = edge.lhs()
- for prod in grammar.productions(rhs=found):
- bindings = {}
- if isinstance(edge, FeatureTreeEdge):
- _next = prod.rhs()[0]
- if not is_nonterminal(_next):
- continue
- # We rename vars here, because we don't want variables
- # from the two different productions to match.
- used_vars = find_variables(
- (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct
- )
- found = found.rename_variables(used_vars=used_vars)
- result = unify(_next, found, bindings, rename_vars=False)
- if result is None:
- continue
- new_edge = FeatureTreeEdge.from_production(
- prod, edge.start()
- ).move_dot_forward(edge.end(), bindings)
- if chart.insert(new_edge, (edge,)):
- yield new_edge
- class FeatureEmptyPredictRule(EmptyPredictRule):
- def apply(self, chart, grammar):
- for prod in grammar.productions(empty=True):
- for index in range(chart.num_leaves() + 1):
- new_edge = FeatureTreeEdge.from_production(prod, index)
- if chart.insert(new_edge, ()):
- yield new_edge
- # ////////////////////////////////////////////////////////////
- # Feature Chart Parser
- # ////////////////////////////////////////////////////////////
- TD_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureTopDownInitRule(),
- FeatureTopDownPredictRule(),
- FeatureSingleEdgeFundamentalRule(),
- ]
- BU_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureEmptyPredictRule(),
- FeatureBottomUpPredictRule(),
- FeatureSingleEdgeFundamentalRule(),
- ]
- BU_LC_FEATURE_STRATEGY = [
- LeafInitRule(),
- FeatureEmptyPredictRule(),
- FeatureBottomUpPredictCombineRule(),
- FeatureSingleEdgeFundamentalRule(),
- ]
- class FeatureChartParser(ChartParser):
- def __init__(
- self,
- grammar,
- strategy=BU_LC_FEATURE_STRATEGY,
- trace_chart_width=20,
- chart_class=FeatureChart,
- **parser_args
- ):
- ChartParser.__init__(
- self,
- grammar,
- strategy=strategy,
- trace_chart_width=trace_chart_width,
- chart_class=chart_class,
- **parser_args
- )
- class FeatureTopDownChartParser(FeatureChartParser):
- def __init__(self, grammar, **parser_args):
- FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args)
- class FeatureBottomUpChartParser(FeatureChartParser):
- def __init__(self, grammar, **parser_args):
- FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args)
- class FeatureBottomUpLeftCornerChartParser(FeatureChartParser):
- def __init__(self, grammar, **parser_args):
- FeatureChartParser.__init__(
- self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args
- )
- # ////////////////////////////////////////////////////////////
- # Instantiate Variable Chart
- # ////////////////////////////////////////////////////////////
- class InstantiateVarsChart(FeatureChart):
- """
- A specialized chart that 'instantiates' variables whose names
- start with '@', by replacing them with unique new variables.
- In particular, whenever a complete edge is added to the chart, any
- variables in the edge's ``lhs`` whose names start with '@' will be
- replaced by unique new ``Variable``s.
- """
- def __init__(self, tokens):
- FeatureChart.__init__(self, tokens)
- def initialize(self):
- self._instantiated = set()
- FeatureChart.initialize(self)
- def insert(self, edge, child_pointer_list):
- if edge in self._instantiated:
- return False
- self.instantiate_edge(edge)
- return FeatureChart.insert(self, edge, child_pointer_list)
- def instantiate_edge(self, edge):
- """
- If the edge is a ``FeatureTreeEdge``, and it is complete,
- then instantiate all variables whose names start with '@',
- by replacing them with unique new variables.
- Note that instantiation is done in-place, since the
- parsing algorithms might already hold a reference to
- the edge for future use.
- """
- # If the edge is a leaf, or is not complete, or is
- # already in the chart, then just return it as-is.
- if not isinstance(edge, FeatureTreeEdge):
- return
- if not edge.is_complete():
- return
- if edge in self._edge_to_cpls:
- return
- # Get a list of variables that need to be instantiated.
- # If there are none, then return as-is.
- inst_vars = self.inst_vars(edge)
- if not inst_vars:
- return
- # Instantiate the edge!
- self._instantiated.add(edge)
- edge._lhs = edge.lhs().substitute_bindings(inst_vars)
- def inst_vars(self, edge):
- return dict(
- (var, logic.unique_variable())
- for var in edge.lhs().variables()
- if var.name.startswith('@')
- )
- # ////////////////////////////////////////////////////////////
- # Demo
- # ////////////////////////////////////////////////////////////
- def demo_grammar():
- from nltk.grammar import FeatureGrammar
- return FeatureGrammar.fromstring(
- """
- S -> NP VP
- PP -> Prep NP
- NP -> NP PP
- VP -> VP PP
- VP -> Verb NP
- VP -> Verb
- NP -> Det[pl=?x] Noun[pl=?x]
- NP -> "John"
- NP -> "I"
- Det -> "the"
- Det -> "my"
- Det[-pl] -> "a"
- Noun[-pl] -> "dog"
- Noun[-pl] -> "cookie"
- Verb -> "ate"
- Verb -> "saw"
- Prep -> "with"
- Prep -> "under"
- """
- )
- def demo(
- print_times=True,
- print_grammar=True,
- print_trees=True,
- print_sentence=True,
- trace=1,
- parser=FeatureChartParser,
- sent='I saw John with a dog with my cookie',
- ):
- import sys, time
- print()
- grammar = demo_grammar()
- if print_grammar:
- print(grammar)
- print()
- print("*", parser.__name__)
- if print_sentence:
- print("Sentence:", sent)
- tokens = sent.split()
- t = time.clock()
- cp = parser(grammar, trace=trace)
- chart = cp.chart_parse(tokens)
- trees = list(chart.parses(grammar.start()))
- if print_times:
- print("Time: %s" % (time.clock() - t))
- if print_trees:
- for tree in trees:
- print(tree)
- else:
- print("Nr trees:", len(trees))
- def run_profile():
- import profile
- profile.run('for i in range(1): demo()', '/tmp/profile.out')
- import pstats
- p = pstats.Stats('/tmp/profile.out')
- p.strip_dirs().sort_stats('time', 'cum').print_stats(60)
- p.strip_dirs().sort_stats('cum', 'time').print_stats(60)
- if __name__ == '__main__':
- from nltk.data import load
- demo()
- print()
- grammar = load('grammars/book_grammars/feat0.fcfg')
- cp = FeatureChartParser(grammar, trace=2)
- sent = 'Kim likes children'
- tokens = sent.split()
- trees = cp.parse(tokens)
- for tree in trees:
- print(tree)
|