1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: A Chart Parser
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # Steven Bird <stevenbird1@gmail.com>
- # Jean Mark Gawron <gawron@mail.sdsu.edu>
- # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Data classes and parser implementations for "chart parsers", which
- use dynamic programming to efficiently parse a text. A chart
- parser derives parse trees for a text by iteratively adding "edges"
- to a "chart." Each edge represents a hypothesis about the tree
- structure for a subsequence of the text. The chart is a
- "blackboard" for composing and combining these hypotheses.
- When a chart parser begins parsing a text, it creates a new (empty)
- chart, spanning the text. It then incrementally adds new edges to the
- chart. A set of "chart rules" specifies the conditions under which
- new edges should be added to the chart. Once the chart reaches a
- stage where none of the chart rules adds any new edges, parsing is
- complete.
- Charts are encoded with the ``Chart`` class, and edges are encoded with
- the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module
- defines three chart parsers:
- - ``ChartParser`` is a simple and flexible chart parser. Given a
- set of chart rules, it will apply those rules to the chart until
- no more edges are added.
- - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
- be used to step through the parsing process.
- """
- from __future__ import print_function, division, unicode_literals
- import itertools
- import re
- import warnings
- from functools import total_ordering
- from six.moves import range
- from nltk.tree import Tree
- from nltk.grammar import PCFG, is_nonterminal, is_terminal
- from nltk.util import OrderedDict
- from nltk.internals import raise_unorderable_types
- from nltk.compat import python_2_unicode_compatible, unicode_repr
- from nltk.parse.api import ParserI
- ########################################################################
- ## Edges
- ########################################################################
- @total_ordering
- class EdgeI(object):
- """
- A hypothesis about the structure of part of a sentence.
- Each edge records the fact that a structure is (partially)
- consistent with the sentence. An edge contains:
- - A span, indicating what part of the sentence is
- consistent with the hypothesized structure.
- - A left-hand side, specifying what kind of structure is
- hypothesized.
- - A right-hand side, specifying the contents of the
- hypothesized structure.
- - A dot position, indicating how much of the hypothesized
- structure is consistent with the sentence.
- Every edge is either complete or incomplete:
- - An edge is complete if its structure is fully consistent
- with the sentence.
- - An edge is incomplete if its structure is partially
- consistent with the sentence. For every incomplete edge, the
- span specifies a possible prefix for the edge's structure.
- There are two kinds of edge:
- - A ``TreeEdge`` records which trees have been found to
- be (partially) consistent with the text.
- - A ``LeafEdge`` records the tokens occurring in the text.
- The ``EdgeI`` interface provides a common interface to both types
- of edge, allowing chart parsers to treat them in a uniform manner.
- """
- def __init__(self):
- if self.__class__ == EdgeI:
- raise TypeError('Edge is an abstract interface')
- # ////////////////////////////////////////////////////////////
- # Span
- # ////////////////////////////////////////////////////////////
- def span(self):
- """
- Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the
- portion of the sentence that is consistent with this
- edge's structure.
- :rtype: tuple(int, int)
- """
- raise NotImplementedError()
- def start(self):
- """
- Return the start index of this edge's span.
- :rtype: int
- """
- raise NotImplementedError()
- def end(self):
- """
- Return the end index of this edge's span.
- :rtype: int
- """
- raise NotImplementedError()
- def length(self):
- """
- Return the length of this edge's span.
- :rtype: int
- """
- raise NotImplementedError()
- # ////////////////////////////////////////////////////////////
- # Left Hand Side
- # ////////////////////////////////////////////////////////////
- def lhs(self):
- """
- Return this edge's left-hand side, which specifies what kind
- of structure is hypothesized by this edge.
- :see: ``TreeEdge`` and ``LeafEdge`` for a description of
- the left-hand side values for each edge type.
- """
- raise NotImplementedError()
- # ////////////////////////////////////////////////////////////
- # Right Hand Side
- # ////////////////////////////////////////////////////////////
- def rhs(self):
- """
- Return this edge's right-hand side, which specifies
- the content of the structure hypothesized by this edge.
- :see: ``TreeEdge`` and ``LeafEdge`` for a description of
- the right-hand side values for each edge type.
- """
- raise NotImplementedError()
- def dot(self):
- """
- Return this edge's dot position, which indicates how much of
- the hypothesized structure is consistent with the
- sentence. In particular, ``self.rhs[:dot]`` is consistent
- with ``tokens[self.start():self.end()]``.
- :rtype: int
- """
- raise NotImplementedError()
- def nextsym(self):
- """
- Return the element of this edge's right-hand side that
- immediately follows its dot.
- :rtype: Nonterminal or terminal or None
- """
- raise NotImplementedError()
- def is_complete(self):
- """
- Return True if this edge's structure is fully consistent
- with the text.
- :rtype: bool
- """
- raise NotImplementedError()
- def is_incomplete(self):
- """
- Return True if this edge's structure is partially consistent
- with the text.
- :rtype: bool
- """
- raise NotImplementedError()
- # ////////////////////////////////////////////////////////////
- # Comparisons & hashing
- # ////////////////////////////////////////////////////////////
- def __eq__(self, other):
- return (
- self.__class__ is other.__class__
- and self._comparison_key == other._comparison_key
- )
- def __ne__(self, other):
- return not self == other
- def __lt__(self, other):
- if not isinstance(other, EdgeI):
- raise_unorderable_types("<", self, other)
- if self.__class__ is other.__class__:
- return self._comparison_key < other._comparison_key
- else:
- return self.__class__.__name__ < other.__class__.__name__
- def __hash__(self):
- try:
- return self._hash
- except AttributeError:
- self._hash = hash(self._comparison_key)
- return self._hash
- @python_2_unicode_compatible
- class TreeEdge(EdgeI):
- """
- An edge that records the fact that a tree is (partially)
- consistent with the sentence. A tree edge consists of:
- - A span, indicating what part of the sentence is
- consistent with the hypothesized tree.
- - A left-hand side, specifying the hypothesized tree's node
- value.
- - A right-hand side, specifying the hypothesized tree's
- children. Each element of the right-hand side is either a
- terminal, specifying a token with that terminal as its leaf
- value; or a nonterminal, specifying a subtree with that
- nonterminal's symbol as its node value.
- - A dot position, indicating which children are consistent
- with part of the sentence. In particular, if ``dot`` is the
- dot position, ``rhs`` is the right-hand size, ``(start,end)``
- is the span, and ``sentence`` is the list of tokens in the
- sentence, then ``tokens[start:end]`` can be spanned by the
- children specified by ``rhs[:dot]``.
- For more information about edges, see the ``EdgeI`` interface.
- """
- def __init__(self, span, lhs, rhs, dot=0):
- """
- Construct a new ``TreeEdge``.
- :type span: tuple(int, int)
- :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
- portion of the sentence that is consistent with the new
- edge's structure.
- :type lhs: Nonterminal
- :param lhs: The new edge's left-hand side, specifying the
- hypothesized tree's node value.
- :type rhs: list(Nonterminal and str)
- :param rhs: The new edge's right-hand side, specifying the
- hypothesized tree's children.
- :type dot: int
- :param dot: The position of the new edge's dot. This position
- specifies what prefix of the production's right hand side
- is consistent with the text. In particular, if
- ``sentence`` is the list of tokens in the sentence, then
- ``okens[span[0]:span[1]]`` can be spanned by the
- children specified by ``rhs[:dot]``.
- """
- self._span = span
- self._lhs = lhs
- rhs = tuple(rhs)
- self._rhs = rhs
- self._dot = dot
- self._comparison_key = (span, lhs, rhs, dot)
- @staticmethod
- def from_production(production, index):
- """
- Return a new ``TreeEdge`` formed from the given production.
- The new edge's left-hand side and right-hand side will
- be taken from ``production``; its span will be
- ``(index,index)``; and its dot position will be ``0``.
- :rtype: TreeEdge
- """
- return TreeEdge(
- span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
- )
- def move_dot_forward(self, new_end):
- """
- Return a new ``TreeEdge`` formed from this edge.
- The new edge's dot position is increased by ``1``,
- and its end index will be replaced by ``new_end``.
- :param new_end: The new end index.
- :type new_end: int
- :rtype: TreeEdge
- """
- return TreeEdge(
- span=(self._span[0], new_end),
- lhs=self._lhs,
- rhs=self._rhs,
- dot=self._dot + 1,
- )
- # Accessors
- def lhs(self):
- return self._lhs
- def span(self):
- return self._span
- def start(self):
- return self._span[0]
- def end(self):
- return self._span[1]
- def length(self):
- return self._span[1] - self._span[0]
- def rhs(self):
- return self._rhs
- def dot(self):
- return self._dot
- def is_complete(self):
- return self._dot == len(self._rhs)
- def is_incomplete(self):
- return self._dot != len(self._rhs)
- def nextsym(self):
- if self._dot >= len(self._rhs):
- return None
- else:
- return self._rhs[self._dot]
- # String representation
- def __str__(self):
- str = '[%s:%s] ' % (self._span[0], self._span[1])
- str += '%-2r ->' % (self._lhs,)
- for i in range(len(self._rhs)):
- if i == self._dot:
- str += ' *'
- str += ' %s' % unicode_repr(self._rhs[i])
- if len(self._rhs) == self._dot:
- str += ' *'
- return str
- def __repr__(self):
- return '[Edge: %s]' % self
- @python_2_unicode_compatible
- class LeafEdge(EdgeI):
- """
- An edge that records the fact that a leaf value is consistent with
- a word in the sentence. A leaf edge consists of:
- - An index, indicating the position of the word.
- - A leaf, specifying the word's content.
- A leaf edge's left-hand side is its leaf value, and its right hand
- side is ``()``. Its span is ``[index, index+1]``, and its dot
- position is ``0``.
- """
- def __init__(self, leaf, index):
- """
- Construct a new ``LeafEdge``.
- :param leaf: The new edge's leaf value, specifying the word
- that is recorded by this edge.
- :param index: The new edge's index, specifying the position of
- the word that is recorded by this edge.
- """
- self._leaf = leaf
- self._index = index
- self._comparison_key = (leaf, index)
- # Accessors
- def lhs(self):
- return self._leaf
- def span(self):
- return (self._index, self._index + 1)
- def start(self):
- return self._index
- def end(self):
- return self._index + 1
- def length(self):
- return 1
- def rhs(self):
- return ()
- def dot(self):
- return 0
- def is_complete(self):
- return True
- def is_incomplete(self):
- return False
- def nextsym(self):
- return None
- # String representations
- def __str__(self):
- return '[%s:%s] %s' % (self._index, self._index + 1, unicode_repr(self._leaf))
- def __repr__(self):
- return '[Edge: %s]' % (self)
- ########################################################################
- ## Chart
- ########################################################################
- class Chart(object):
- """
- A blackboard for hypotheses about the syntactic constituents of a
- sentence. A chart contains a set of edges, and each edge encodes
- a single hypothesis about the structure of some portion of the
- sentence.
- The ``select`` method can be used to select a specific collection
- of edges. For example ``chart.select(is_complete=True, start=0)``
- yields all complete edges whose start indices are 0. To ensure
- the efficiency of these selection operations, ``Chart`` dynamically
- creates and maintains an index for each set of attributes that
- have been selected on.
- In order to reconstruct the trees that are represented by an edge,
- the chart associates each edge with a set of child pointer lists.
- A child pointer list is a list of the edges that license an
- edge's right-hand side.
- :ivar _tokens: The sentence that the chart covers.
- :ivar _num_leaves: The number of tokens.
- :ivar _edges: A list of the edges in the chart
- :ivar _edge_to_cpls: A dictionary mapping each edge to a set
- of child pointer lists that are associated with that edge.
- :ivar _indexes: A dictionary mapping tuples of edge attributes
- to indices, where each index maps the corresponding edge
- attribute values to lists of edges.
- """
- def __init__(self, tokens):
- """
- Construct a new chart. The chart is initialized with the
- leaf edges corresponding to the terminal leaves.
- :type tokens: list
- :param tokens: The sentence that this chart will be used to parse.
- """
- # Record the sentence token and the sentence length.
- self._tokens = tuple(tokens)
- self._num_leaves = len(self._tokens)
- # Initialise the chart.
- self.initialize()
- def initialize(self):
- """
- Clear the chart.
- """
- # A list of edges contained in this chart.
- self._edges = []
- # The set of child pointer lists associated with each edge.
- self._edge_to_cpls = {}
- # Indexes mapping attribute values to lists of edges
- # (used by select()).
- self._indexes = {}
- # ////////////////////////////////////////////////////////////
- # Sentence Access
- # ////////////////////////////////////////////////////////////
- def num_leaves(self):
- """
- Return the number of words in this chart's sentence.
- :rtype: int
- """
- return self._num_leaves
- def leaf(self, index):
- """
- Return the leaf value of the word at the given index.
- :rtype: str
- """
- return self._tokens[index]
- def leaves(self):
- """
- Return a list of the leaf values of each word in the
- chart's sentence.
- :rtype: list(str)
- """
- return self._tokens
- # ////////////////////////////////////////////////////////////
- # Edge access
- # ////////////////////////////////////////////////////////////
- def edges(self):
- """
- Return a list of all edges in this chart. New edges
- that are added to the chart after the call to edges()
- will *not* be contained in this list.
- :rtype: list(EdgeI)
- :see: ``iteredges``, ``select``
- """
- return self._edges[:]
- def iteredges(self):
- """
- Return an iterator over the edges in this chart. It is
- not guaranteed that new edges which are added to the
- chart before the iterator is exhausted will also be generated.
- :rtype: iter(EdgeI)
- :see: ``edges``, ``select``
- """
- return iter(self._edges)
- # Iterating over the chart yields its edges.
- __iter__ = iteredges
- def num_edges(self):
- """
- Return the number of edges contained in this chart.
- :rtype: int
- """
- return len(self._edge_to_cpls)
- def select(self, **restrictions):
- """
- Return an iterator over the edges in this chart. Any
- new edges that are added to the chart before the iterator
- is exahusted will also be generated. ``restrictions``
- can be used to restrict the set of edges that will be
- generated.
- :param span: Only generate edges ``e`` where ``e.span()==span``
- :param start: Only generate edges ``e`` where ``e.start()==start``
- :param end: Only generate edges ``e`` where ``e.end()==end``
- :param length: Only generate edges ``e`` where ``e.length()==length``
- :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs``
- :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs``
- :param nextsym: Only generate edges ``e`` where
- ``e.nextsym()==nextsym``
- :param dot: Only generate edges ``e`` where ``e.dot()==dot``
- :param is_complete: Only generate edges ``e`` where
- ``e.is_complete()==is_complete``
- :param is_incomplete: Only generate edges ``e`` where
- ``e.is_incomplete()==is_incomplete``
- :rtype: iter(EdgeI)
- """
- # If there are no restrictions, then return all edges.
- if restrictions == {}:
- return iter(self._edges)
- # Find the index corresponding to the given restrictions.
- restr_keys = sorted(restrictions.keys())
- restr_keys = tuple(restr_keys)
- # If it doesn't exist, then create it.
- if restr_keys not in self._indexes:
- self._add_index(restr_keys)
- vals = tuple(restrictions[key] for key in restr_keys)
- return iter(self._indexes[restr_keys].get(vals, []))
- def _add_index(self, restr_keys):
- """
- A helper function for ``select``, which creates a new index for
- a given set of attributes (aka restriction keys).
- """
- # Make sure it's a valid index.
- for key in restr_keys:
- if not hasattr(EdgeI, key):
- raise ValueError('Bad restriction: %s' % key)
- # Create the index.
- index = self._indexes[restr_keys] = {}
- # Add all existing edges to the index.
- for edge in self._edges:
- vals = tuple(getattr(edge, key)() for key in restr_keys)
- index.setdefault(vals, []).append(edge)
- def _register_with_indexes(self, edge):
- """
- A helper function for ``insert``, which registers the new
- edge with all existing indexes.
- """
- for (restr_keys, index) in self._indexes.items():
- vals = tuple(getattr(edge, key)() for key in restr_keys)
- index.setdefault(vals, []).append(edge)
- # ////////////////////////////////////////////////////////////
- # Edge Insertion
- # ////////////////////////////////////////////////////////////
- def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
- """
- Add a new edge to the chart, using a pointer to the previous edge.
- """
- cpls = self.child_pointer_lists(previous_edge)
- new_cpls = [cpl + (child_edge,) for cpl in cpls]
- return self.insert(new_edge, *new_cpls)
- def insert(self, edge, *child_pointer_lists):
- """
- Add a new edge to the chart, and return True if this operation
- modified the chart. In particular, return true iff the chart
- did not already contain ``edge``, or if it did not already associate
- ``child_pointer_lists`` with ``edge``.
- :type edge: EdgeI
- :param edge: The new edge
- :type child_pointer_lists: sequence of tuple(EdgeI)
- :param child_pointer_lists: A sequence of lists of the edges that
- were used to form this edge. This list is used to reconstruct
- the trees (or partial trees) that are associated with ``edge``.
- :rtype: bool
- """
- # Is it a new edge?
- if edge not in self._edge_to_cpls:
- # Add it to the list of edges.
- self._append_edge(edge)
- # Register with indexes.
- self._register_with_indexes(edge)
- # Get the set of child pointer lists for this edge.
- cpls = self._edge_to_cpls.setdefault(edge, OrderedDict())
- chart_was_modified = False
- for child_pointer_list in child_pointer_lists:
- child_pointer_list = tuple(child_pointer_list)
- if child_pointer_list not in cpls:
- # It's a new CPL; register it, and return true.
- cpls[child_pointer_list] = True
- chart_was_modified = True
- return chart_was_modified
- def _append_edge(self, edge):
- self._edges.append(edge)
- # ////////////////////////////////////////////////////////////
- # Tree extraction & child pointer lists
- # ////////////////////////////////////////////////////////////
- def parses(self, root, tree_class=Tree):
- """
- Return an iterator of the complete tree structures that span
- the entire chart, and whose root node is ``root``.
- """
- for edge in self.select(start=0, end=self._num_leaves, lhs=root):
- for tree in self.trees(edge, tree_class=tree_class, complete=True):
- yield tree
- def trees(self, edge, tree_class=Tree, complete=False):
- """
- Return an iterator of the tree structures that are associated
- with ``edge``.
- If ``edge`` is incomplete, then the unexpanded children will be
- encoded as childless subtrees, whose node value is the
- corresponding terminal or nonterminal.
- :rtype: list(Tree)
- :note: If two trees share a common subtree, then the same
- Tree may be used to encode that subtree in
- both trees. If you need to eliminate this subtree
- sharing, then create a deep copy of each tree.
- """
- return iter(self._trees(edge, complete, memo={}, tree_class=tree_class))
- def _trees(self, edge, complete, memo, tree_class):
- """
- A helper function for ``trees``.
- :param memo: A dictionary used to record the trees that we've
- generated for each edge, so that when we see an edge more
- than once, we can reuse the same trees.
- """
- # If we've seen this edge before, then reuse our old answer.
- if edge in memo:
- return memo[edge]
- # when we're reading trees off the chart, don't use incomplete edges
- if complete and edge.is_incomplete():
- return []
- # Leaf edges.
- if isinstance(edge, LeafEdge):
- leaf = self._tokens[edge.start()]
- memo[edge] = [leaf]
- return [leaf]
- # Until we're done computing the trees for edge, set
- # memo[edge] to be empty. This has the effect of filtering
- # out any cyclic trees (i.e., trees that contain themselves as
- # descendants), because if we reach this edge via a cycle,
- # then it will appear that the edge doesn't generate any trees.
- memo[edge] = []
- trees = []
- lhs = edge.lhs().symbol()
- # Each child pointer list can be used to form trees.
- for cpl in self.child_pointer_lists(edge):
- # Get the set of child choices for each child pointer.
- # child_choices[i] is the set of choices for the tree's
- # ith child.
- child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
- # For each combination of children, add a tree.
- for children in itertools.product(*child_choices):
- trees.append(tree_class(lhs, children))
- # If the edge is incomplete, then extend it with "partial trees":
- if edge.is_incomplete():
- unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]]
- for tree in trees:
- tree.extend(unexpanded)
- # Update the memoization dictionary.
- memo[edge] = trees
- # Return the list of trees.
- return trees
- def child_pointer_lists(self, edge):
- """
- Return the set of child pointer lists for the given edge.
- Each child pointer list is a list of edges that have
- been used to form this edge.
- :rtype: list(list(EdgeI))
- """
- # Make a copy, in case they modify it.
- return self._edge_to_cpls.get(edge, {}).keys()
- # ////////////////////////////////////////////////////////////
- # Display
- # ////////////////////////////////////////////////////////////
- def pretty_format_edge(self, edge, width=None):
- """
- Return a pretty-printed string representation of a given edge
- in this chart.
- :rtype: str
- :param width: The number of characters allotted to each
- index in the sentence.
- """
- if width is None:
- width = 50 // (self.num_leaves() + 1)
- (start, end) = (edge.start(), edge.end())
- str = '|' + ('.' + ' ' * (width - 1)) * start
- # Zero-width edges are "#" if complete, ">" if incomplete
- if start == end:
- if edge.is_complete():
- str += '#'
- else:
- str += '>'
- # Spanning complete edges are "[===]"; Other edges are
- # "[---]" if complete, "[--->" if incomplete
- elif edge.is_complete() and edge.span() == (0, self._num_leaves):
- str += '[' + ('=' * width) * (end - start - 1) + '=' * (width - 1) + ']'
- elif edge.is_complete():
- str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + ']'
- else:
- str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + '>'
- str += (' ' * (width - 1) + '.') * (self._num_leaves - end)
- return str + '| %s' % edge
- def pretty_format_leaves(self, width=None):
- """
- Return a pretty-printed string representation of this
- chart's leaves. This string can be used as a header
- for calls to ``pretty_format_edge``.
- """
- if width is None:
- width = 50 // (self.num_leaves() + 1)
- if self._tokens is not None and width > 1:
- header = '|.'
- for tok in self._tokens:
- header += tok[: width - 1].center(width - 1) + '.'
- header += '|'
- else:
- header = ''
- return header
- def pretty_format(self, width=None):
- """
- Return a pretty-printed string representation of this chart.
- :param width: The number of characters allotted to each
- index in the sentence.
- :rtype: str
- """
- if width is None:
- width = 50 // (self.num_leaves() + 1)
- # sort edges: primary key=length, secondary key=start index.
- # (and filter out the token edges)
- edges = sorted([(e.length(), e.start(), e) for e in self])
- edges = [e for (_, _, e) in edges]
- return (
- self.pretty_format_leaves(width)
- + '\n'
- + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges)
- )
- # ////////////////////////////////////////////////////////////
- # Display: Dot (AT&T Graphviz)
- # ////////////////////////////////////////////////////////////
- def dot_digraph(self):
- # Header
- s = 'digraph nltk_chart {\n'
- # s += ' size="5,5";\n'
- s += ' rankdir=LR;\n'
- s += ' node [height=0.1,width=0.1];\n'
- s += ' node [style=filled, color="lightgray"];\n'
- # Set up the nodes
- for y in range(self.num_edges(), -1, -1):
- if y == 0:
- s += ' node [style=filled, color="black"];\n'
- for x in range(self.num_leaves() + 1):
- if y == 0 or (
- x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
- ):
- s += ' %04d.%04d [label=""];\n' % (x, y)
- # Add a spacer
- s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
- # Declare ranks.
- for x in range(self.num_leaves() + 1):
- s += ' {rank=same;'
- for y in range(self.num_edges() + 1):
- if y == 0 or (
- x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
- ):
- s += ' %04d.%04d' % (x, y)
- s += '}\n'
- # Add the leaves
- s += ' edge [style=invis, weight=100];\n'
- s += ' node [shape=plaintext]\n'
- s += ' 0000.0000'
- for x in range(self.num_leaves()):
- s += '->%s->%04d.0000' % (self.leaf(x), x + 1)
- s += ';\n\n'
- # Add the edges
- s += ' edge [style=solid, weight=1];\n'
- for y, edge in enumerate(self):
- for x in range(edge.start()):
- s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
- x,
- y + 1,
- x + 1,
- y + 1,
- )
- s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % (
- edge.start(),
- y + 1,
- edge.end(),
- y + 1,
- edge,
- )
- for x in range(edge.end(), self.num_leaves()):
- s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
- x,
- y + 1,
- x + 1,
- y + 1,
- )
- s += '}\n'
- return s
- ########################################################################
- ## Chart Rules
- ########################################################################
- class ChartRuleI(object):
- """
- A rule that specifies what new edges are licensed by any given set
- of existing edges. Each chart rule expects a fixed number of
- edges, as indicated by the class variable ``NUM_EDGES``. In
- particular:
- - A chart rule with ``NUM_EDGES=0`` specifies what new edges are
- licensed, regardless of existing edges.
- - A chart rule with ``NUM_EDGES=1`` specifies what new edges are
- licensed by a single existing edge.
- - A chart rule with ``NUM_EDGES=2`` specifies what new edges are
- licensed by a pair of existing edges.
- :type NUM_EDGES: int
- :cvar NUM_EDGES: The number of existing edges that this rule uses
- to license new edges. Typically, this number ranges from zero
- to two.
- """
- def apply(self, chart, grammar, *edges):
- """
- Return a generator that will add edges licensed by this rule
- and the given edges to the chart, one at a time. Each
- time the generator is resumed, it will either add a new
- edge and yield that edge; or return.
- :type edges: list(EdgeI)
- :param edges: A set of existing edges. The number of edges
- that should be passed to ``apply()`` is specified by the
- ``NUM_EDGES`` class variable.
- :rtype: iter(EdgeI)
- """
- raise NotImplementedError()
- def apply_everywhere(self, chart, grammar):
- """
- Return a generator that will add all edges licensed by
- this rule, given the edges that are currently in the
- chart, one at a time. Each time the generator is resumed,
- it will either add a new edge and yield that edge; or return.
- :rtype: iter(EdgeI)
- """
- raise NotImplementedError()
- @python_2_unicode_compatible
- class AbstractChartRule(ChartRuleI):
- """
- An abstract base class for chart rules. ``AbstractChartRule``
- provides:
- - A default implementation for ``apply``.
- - A default implementation for ``apply_everywhere``,
- (Currently, this implementation assumes that ``NUM_EDGES``<=3.)
- - A default implementation for ``__str__``, which returns a
- name based on the rule's class name.
- """
- # Subclasses must define apply.
- def apply(self, chart, grammar, *edges):
- raise NotImplementedError()
- # Default: loop through the given number of edges, and call
- # self.apply() for each set of edges.
- def apply_everywhere(self, chart, grammar):
- if self.NUM_EDGES == 0:
- for new_edge in self.apply(chart, grammar):
- yield new_edge
- elif self.NUM_EDGES == 1:
- for e1 in chart:
- for new_edge in self.apply(chart, grammar, e1):
- yield new_edge
- elif self.NUM_EDGES == 2:
- for e1 in chart:
- for e2 in chart:
- for new_edge in self.apply(chart, grammar, e1, e2):
- yield new_edge
- elif self.NUM_EDGES == 3:
- for e1 in chart:
- for e2 in chart:
- for e3 in chart:
- for new_edge in self.apply(chart, grammar, e1, e2, e3):
- yield new_edge
- else:
- raise AssertionError('NUM_EDGES>3 is not currently supported')
- # Default: return a name based on the class name.
- def __str__(self):
- # Add spaces between InitialCapsWords.
- return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
- # ////////////////////////////////////////////////////////////
- # Fundamental Rule
- # ////////////////////////////////////////////////////////////
- class FundamentalRule(AbstractChartRule):
- """
- A rule that joins two adjacent edges to form a single combined
- edge. In particular, this rule specifies that any pair of edges
- - ``[A -> alpha \* B beta][i:j]``
- - ``[B -> gamma \*][j:k]``
- licenses the edge:
- - ``[A -> alpha B * beta][i:j]``
- """
- NUM_EDGES = 2
- def apply(self, chart, grammar, left_edge, right_edge):
- # Make sure the rule is applicable.
- if not (
- left_edge.is_incomplete()
- and right_edge.is_complete()
- and left_edge.end() == right_edge.start()
- and left_edge.nextsym() == right_edge.lhs()
- ):
- return
- # Construct the new edge.
- new_edge = left_edge.move_dot_forward(right_edge.end())
- # Insert it into the chart.
- if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
- yield new_edge
- class SingleEdgeFundamentalRule(FundamentalRule):
- """
- A rule that joins a given edge with adjacent edges in the chart,
- to form combined edges. In particular, this rule specifies that
- either of the edges:
- - ``[A -> alpha \* B beta][i:j]``
- - ``[B -> gamma \*][j:k]``
- licenses the edge:
- - ``[A -> alpha B * beta][i:j]``
- if the other edge is already in the chart.
- :note: This is basically ``FundamentalRule``, with one edge left
- unspecified.
- """
- NUM_EDGES = 1
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- for new_edge in self._apply_incomplete(chart, grammar, edge):
- yield new_edge
- else:
- for new_edge in self._apply_complete(chart, grammar, edge):
- yield new_edge
- def _apply_complete(self, chart, grammar, right_edge):
- for left_edge in chart.select(
- end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
- ):
- new_edge = left_edge.move_dot_forward(right_edge.end())
- if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
- yield new_edge
- def _apply_incomplete(self, chart, grammar, left_edge):
- for right_edge in chart.select(
- start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
- ):
- new_edge = left_edge.move_dot_forward(right_edge.end())
- if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
- yield new_edge
- # ////////////////////////////////////////////////////////////
- # Inserting Terminal Leafs
- # ////////////////////////////////////////////////////////////
- class LeafInitRule(AbstractChartRule):
- NUM_EDGES = 0
- def apply(self, chart, grammar):
- for index in range(chart.num_leaves()):
- new_edge = LeafEdge(chart.leaf(index), index)
- if chart.insert(new_edge, ()):
- yield new_edge
- # ////////////////////////////////////////////////////////////
- # Top-Down Prediction
- # ////////////////////////////////////////////////////////////
- class TopDownInitRule(AbstractChartRule):
- """
- A rule licensing edges corresponding to the grammar productions for
- the grammar's start symbol. In particular, this rule specifies that
- ``[S -> \* alpha][0:i]`` is licensed for each grammar production
- ``S -> alpha``, where ``S`` is the grammar's start symbol.
- """
- NUM_EDGES = 0
- def apply(self, chart, grammar):
- for prod in grammar.productions(lhs=grammar.start()):
- new_edge = TreeEdge.from_production(prod, 0)
- if chart.insert(new_edge, ()):
- yield new_edge
- class TopDownPredictRule(AbstractChartRule):
- """
- A rule licensing edges corresponding to the grammar productions
- for the nonterminal following an incomplete edge's dot. In
- particular, this rule specifies that
- ``[A -> alpha \* B beta][i:j]`` licenses the edge
- ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``.
- :note: This rule corresponds to the Predictor Rule in Earley parsing.
- """
- NUM_EDGES = 1
- def apply(self, chart, grammar, edge):
- if edge.is_complete():
- return
- for prod in grammar.productions(lhs=edge.nextsym()):
- new_edge = TreeEdge.from_production(prod, edge.end())
- if chart.insert(new_edge, ()):
- yield new_edge
- class CachedTopDownPredictRule(TopDownPredictRule):
- """
- A cached version of ``TopDownPredictRule``. After the first time
- this rule is applied to an edge with a given ``end`` and ``next``,
- it will not generate any more edges for edges with that ``end`` and
- ``next``.
- If ``chart`` or ``grammar`` are changed, then the cache is flushed.
- """
- def __init__(self):
- TopDownPredictRule.__init__(self)
- self._done = {}
- def apply(self, chart, grammar, edge):
- if edge.is_complete():
- return
- nextsym, index = edge.nextsym(), edge.end()
- if not is_nonterminal(nextsym):
- return
- # If we've already applied this rule to an edge with the same
- # next & end, and the chart & grammar have not changed, then
- # just return (no new edges to add).
- done = self._done.get((nextsym, index), (None, None))
- if done[0] is chart and done[1] is grammar:
- return
- # Add all the edges indicated by the top down expand rule.
- for prod in grammar.productions(lhs=nextsym):
- # If the left corner in the predicted production is
- # leaf, it must match with the input.
- if prod.rhs():
- first = prod.rhs()[0]
- if is_terminal(first):
- if index >= chart.num_leaves() or first != chart.leaf(index):
- continue
- new_edge = TreeEdge.from_production(prod, index)
- if chart.insert(new_edge, ()):
- yield new_edge
- # Record the fact that we've applied this rule.
- self._done[nextsym, index] = (chart, grammar)
- # ////////////////////////////////////////////////////////////
- # Bottom-Up Prediction
- # ////////////////////////////////////////////////////////////
- class BottomUpPredictRule(AbstractChartRule):
- """
- A rule licensing any edge corresponding to a production whose
- right-hand side begins with a complete edge's left-hand side. In
- particular, this rule specifies that ``[A -> alpha \*]`` licenses
- the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
- """
- NUM_EDGES = 1
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
- for prod in grammar.productions(rhs=edge.lhs()):
- new_edge = TreeEdge.from_production(prod, edge.start())
- if chart.insert(new_edge, ()):
- yield new_edge
- class BottomUpPredictCombineRule(BottomUpPredictRule):
- """
- A rule licensing any edge corresponding to a production whose
- right-hand side begins with a complete edge's left-hand side. In
- particular, this rule specifies that ``[A -> alpha \*]``
- licenses the edge ``[B -> A \* beta]`` for each grammar
- production ``B -> A beta``.
- :note: This is like ``BottomUpPredictRule``, but it also applies
- the ``FundamentalRule`` to the resulting edge.
- """
- NUM_EDGES = 1
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
- for prod in grammar.productions(rhs=edge.lhs()):
- new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
- if chart.insert(new_edge, (edge,)):
- yield new_edge
- class EmptyPredictRule(AbstractChartRule):
- """
- A rule that inserts all empty productions as passive edges,
- in every position in the chart.
- """
- NUM_EDGES = 0
- def apply(self, chart, grammar):
- for prod in grammar.productions(empty=True):
- for index in range(chart.num_leaves() + 1):
- new_edge = TreeEdge.from_production(prod, index)
- if chart.insert(new_edge, ()):
- yield new_edge
- ########################################################################
- ## Filtered Bottom Up
- ########################################################################
- class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
- def _apply_complete(self, chart, grammar, right_edge):
- end = right_edge.end()
- nexttoken = end < chart.num_leaves() and chart.leaf(end)
- for left_edge in chart.select(
- end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
- ):
- if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
- new_edge = left_edge.move_dot_forward(right_edge.end())
- if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
- yield new_edge
- def _apply_incomplete(self, chart, grammar, left_edge):
- for right_edge in chart.select(
- start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
- ):
- end = right_edge.end()
- nexttoken = end < chart.num_leaves() and chart.leaf(end)
- if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
- new_edge = left_edge.move_dot_forward(right_edge.end())
- if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
- yield new_edge
- class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
- def apply(self, chart, grammar, edge):
- if edge.is_incomplete():
- return
- end = edge.end()
- nexttoken = end < chart.num_leaves() and chart.leaf(end)
- for prod in grammar.productions(rhs=edge.lhs()):
- if _bottomup_filter(grammar, nexttoken, prod.rhs()):
- new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
- if chart.insert(new_edge, (edge,)):
- yield new_edge
- def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
- if len(rhs) <= dot + 1:
- return True
- _next = rhs[dot + 1]
- if is_terminal(_next):
- return nexttoken == _next
- else:
- return grammar.is_leftcorner(_next, nexttoken)
- ########################################################################
- ## Generic Chart Parser
- ########################################################################
- TD_STRATEGY = [
- LeafInitRule(),
- TopDownInitRule(),
- CachedTopDownPredictRule(),
- SingleEdgeFundamentalRule(),
- ]
- BU_STRATEGY = [
- LeafInitRule(),
- EmptyPredictRule(),
- BottomUpPredictRule(),
- SingleEdgeFundamentalRule(),
- ]
- BU_LC_STRATEGY = [
- LeafInitRule(),
- EmptyPredictRule(),
- BottomUpPredictCombineRule(),
- SingleEdgeFundamentalRule(),
- ]
- LC_STRATEGY = [
- LeafInitRule(),
- FilteredBottomUpPredictCombineRule(),
- FilteredSingleEdgeFundamentalRule(),
- ]
- class ChartParser(ParserI):
- """
- A generic chart parser. A "strategy", or list of
- ``ChartRuleI`` instances, is used to decide what edges to add to
- the chart. In particular, ``ChartParser`` uses the following
- algorithm to parse texts:
- | Until no new edges are added:
- | For each *rule* in *strategy*:
- | Apply *rule* to any applicable edges in the chart.
- | Return any complete parses in the chart
- """
- def __init__(
- self,
- grammar,
- strategy=BU_LC_STRATEGY,
- trace=0,
- trace_chart_width=50,
- use_agenda=True,
- chart_class=Chart,
- ):
- """
- Create a new chart parser, that uses ``grammar`` to parse
- texts.
- :type grammar: CFG
- :param grammar: The grammar used to parse texts.
- :type strategy: list(ChartRuleI)
- :param strategy: A list of rules that should be used to decide
- what edges to add to the chart (top-down strategy by default).
- :type trace: int
- :param trace: The level of tracing that should be used when
- parsing a text. ``0`` will generate no tracing output;
- and higher numbers will produce more verbose tracing
- output.
- :type trace_chart_width: int
- :param trace_chart_width: The default total width reserved for
- the chart in trace output. The remainder of each line will
- be used to display edges.
- :type use_agenda: bool
- :param use_agenda: Use an optimized agenda-based algorithm,
- if possible.
- :param chart_class: The class that should be used to create
- the parse charts.
- """
- self._grammar = grammar
- self._strategy = strategy
- self._trace = trace
- self._trace_chart_width = trace_chart_width
- # If the strategy only consists of axioms (NUM_EDGES==0) and
- # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm:
- self._use_agenda = use_agenda
- self._chart_class = chart_class
- self._axioms = []
- self._inference_rules = []
- for rule in strategy:
- if rule.NUM_EDGES == 0:
- self._axioms.append(rule)
- elif rule.NUM_EDGES == 1:
- self._inference_rules.append(rule)
- else:
- self._use_agenda = False
- def grammar(self):
- return self._grammar
- def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
- if not trace:
- return
- print_rule_header = trace > 1
- for edge in new_edges:
- if print_rule_header:
- print('%s:' % rule)
- print_rule_header = False
- print(chart.pretty_format_edge(edge, edge_width))
- def chart_parse(self, tokens, trace=None):
- """
- Return the final parse ``Chart`` from which all possible
- parse trees can be extracted.
- :param tokens: The sentence to be parsed
- :type tokens: list(str)
- :rtype: Chart
- """
- if trace is None:
- trace = self._trace
- trace_new_edges = self._trace_new_edges
- tokens = list(tokens)
- self._grammar.check_coverage(tokens)
- chart = self._chart_class(tokens)
- grammar = self._grammar
- # Width, for printing trace edges.
- trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
- if trace:
- print(chart.pretty_format_leaves(trace_edge_width))
- if self._use_agenda:
- # Use an agenda-based algorithm.
- for axiom in self._axioms:
- new_edges = list(axiom.apply(chart, grammar))
- trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
- inference_rules = self._inference_rules
- agenda = chart.edges()
- # We reverse the initial agenda, since it is a stack
- # but chart.edges() functions as a queue.
- agenda.reverse()
- while agenda:
- edge = agenda.pop()
- for rule in inference_rules:
- new_edges = list(rule.apply(chart, grammar, edge))
- if trace:
- trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
- agenda += new_edges
- else:
- # Do not use an agenda-based algorithm.
- edges_added = True
- while edges_added:
- edges_added = False
- for rule in self._strategy:
- new_edges = list(rule.apply_everywhere(chart, grammar))
- edges_added = len(new_edges)
- trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
- # Return the final chart.
- return chart
- def parse(self, tokens, tree_class=Tree):
- chart = self.chart_parse(tokens)
- return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
- class TopDownChartParser(ChartParser):
- """
- A ``ChartParser`` using a top-down parsing strategy.
- See ``ChartParser`` for more information.
- """
- def __init__(self, grammar, **parser_args):
- ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
- class BottomUpChartParser(ChartParser):
- """
- A ``ChartParser`` using a bottom-up parsing strategy.
- See ``ChartParser`` for more information.
- """
- def __init__(self, grammar, **parser_args):
- if isinstance(grammar, PCFG):
- warnings.warn(
- "BottomUpChartParser only works for CFG, "
- "use BottomUpProbabilisticChartParser instead",
- category=DeprecationWarning,
- )
- ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
- class BottomUpLeftCornerChartParser(ChartParser):
- """
- A ``ChartParser`` using a bottom-up left-corner parsing strategy.
- This strategy is often more efficient than standard bottom-up.
- See ``ChartParser`` for more information.
- """
- def __init__(self, grammar, **parser_args):
- ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
- class LeftCornerChartParser(ChartParser):
- def __init__(self, grammar, **parser_args):
- if not grammar.is_nonempty():
- raise ValueError(
- "LeftCornerParser only works for grammars " "without empty productions."
- )
- ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
- ########################################################################
- ## Stepping Chart Parser
- ########################################################################
- class SteppingChartParser(ChartParser):
- """
- A ``ChartParser`` that allows you to step through the parsing
- process, adding a single edge at a time. It also allows you to
- change the parser's strategy or grammar midway through parsing a
- text.
- The ``initialize`` method is used to start parsing a text. ``step``
- adds a single edge to the chart. ``set_strategy`` changes the
- strategy used by the chart parser. ``parses`` returns the set of
- parses that has been found by the chart parser.
- :ivar _restart: Records whether the parser's strategy, grammar,
- or chart has been changed. If so, then ``step`` must restart
- the parsing algorithm.
- """
- def __init__(self, grammar, strategy=[], trace=0):
- self._chart = None
- self._current_chartrule = None
- self._restart = False
- ChartParser.__init__(self, grammar, strategy, trace)
- # ////////////////////////////////////////////////////////////
- # Initialization
- # ////////////////////////////////////////////////////////////
- def initialize(self, tokens):
- "Begin parsing the given tokens."
- self._chart = Chart(list(tokens))
- self._restart = True
- # ////////////////////////////////////////////////////////////
- # Stepping
- # ////////////////////////////////////////////////////////////
- def step(self):
- """
- Return a generator that adds edges to the chart, one at a
- time. Each time the generator is resumed, it adds a single
- edge and yields that edge. If no more edges can be added,
- then it yields None.
- If the parser's strategy, grammar, or chart is changed, then
- the generator will continue adding edges using the new
- strategy, grammar, or chart.
- Note that this generator never terminates, since the grammar
- or strategy might be changed to values that would add new
- edges. Instead, it yields None when no more edges can be
- added with the current strategy and grammar.
- """
- if self._chart is None:
- raise ValueError('Parser must be initialized first')
- while True:
- self._restart = False
- w = 50 // (self._chart.num_leaves() + 1)
- for e in self._parse():
- if self._trace > 1:
- print(self._current_chartrule)
- if self._trace > 0:
- print(self._chart.pretty_format_edge(e, w))
- yield e
- if self._restart:
- break
- else:
- yield None # No more edges.
- def _parse(self):
- """
- A generator that implements the actual parsing algorithm.
- ``step`` iterates through this generator, and restarts it
- whenever the parser's strategy, grammar, or chart is modified.
- """
- chart = self._chart
- grammar = self._grammar
- edges_added = 1
- while edges_added > 0:
- edges_added = 0
- for rule in self._strategy:
- self._current_chartrule = rule
- for e in rule.apply_everywhere(chart, grammar):
- edges_added += 1
- yield e
- # ////////////////////////////////////////////////////////////
- # Accessors
- # ////////////////////////////////////////////////////////////
- def strategy(self):
- "Return the strategy used by this parser."
- return self._strategy
- def grammar(self):
- "Return the grammar used by this parser."
- return self._grammar
- def chart(self):
- "Return the chart that is used by this parser."
- return self._chart
- def current_chartrule(self):
- "Return the chart rule used to generate the most recent edge."
- return self._current_chartrule
- def parses(self, tree_class=Tree):
- "Return the parse trees currently contained in the chart."
- return self._chart.parses(self._grammar.start(), tree_class)
- # ////////////////////////////////////////////////////////////
- # Parser modification
- # ////////////////////////////////////////////////////////////
- def set_strategy(self, strategy):
- """
- Change the strategy that the parser uses to decide which edges
- to add to the chart.
- :type strategy: list(ChartRuleI)
- :param strategy: A list of rules that should be used to decide
- what edges to add to the chart.
- """
- if strategy == self._strategy:
- return
- self._strategy = strategy[:] # Make a copy.
- self._restart = True
- def set_grammar(self, grammar):
- "Change the grammar used by the parser."
- if grammar is self._grammar:
- return
- self._grammar = grammar
- self._restart = True
- def set_chart(self, chart):
- "Load a given chart into the chart parser."
- if chart is self._chart:
- return
- self._chart = chart
- self._restart = True
- # ////////////////////////////////////////////////////////////
- # Standard parser methods
- # ////////////////////////////////////////////////////////////
- def parse(self, tokens, tree_class=Tree):
- tokens = list(tokens)
- self._grammar.check_coverage(tokens)
- # Initialize ourselves.
- self.initialize(tokens)
- # Step until no more edges are generated.
- for e in self.step():
- if e is None:
- break
- # Return an iterator of complete parses.
- return self.parses(tree_class=tree_class)
- ########################################################################
- ## Demo Code
- ########################################################################
- def demo_grammar():
- from nltk.grammar import CFG
- return CFG.fromstring(
- """
- S -> NP VP
- PP -> "with" NP
- NP -> NP PP
- VP -> VP PP
- VP -> Verb NP
- VP -> Verb
- NP -> Det Noun
- NP -> "John"
- NP -> "I"
- Det -> "the"
- Det -> "my"
- Det -> "a"
- Noun -> "dog"
- Noun -> "cookie"
- Verb -> "ate"
- Verb -> "saw"
- Prep -> "with"
- Prep -> "under"
- """
- )
- def demo(
- choice=None,
- print_times=True,
- print_grammar=False,
- print_trees=True,
- trace=2,
- sent='I saw John with a dog with my cookie',
- numparses=5,
- ):
- """
- A demonstration of the chart parsers.
- """
- import sys, time
- from nltk import nonterminals, Production, CFG
- # The grammar for ChartParser and SteppingChartParser:
- grammar = demo_grammar()
- if print_grammar:
- print("* Grammar")
- print(grammar)
- # Tokenize the sample sentence.
- print("* Sentence:")
- print(sent)
- tokens = sent.split()
- print(tokens)
- print()
- # Ask the user which parser to test,
- # if the parser wasn't provided as an argument
- if choice is None:
- print(' 1: Top-down chart parser')
- print(' 2: Bottom-up chart parser')
- print(' 3: Bottom-up left-corner chart parser')
- print(' 4: Left-corner chart parser with bottom-up filter')
- print(' 5: Stepping chart parser (alternating top-down & bottom-up)')
- print(' 6: All parsers')
- print('\nWhich parser (1-6)? ', end=' ')
- choice = sys.stdin.readline().strip()
- print()
- choice = str(choice)
- if choice not in "123456":
- print('Bad parser number')
- return
- # Keep track of how long each parser takes.
- times = {}
- strategies = {
- '1': ('Top-down', TD_STRATEGY),
- '2': ('Bottom-up', BU_STRATEGY),
- '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
- '4': ('Filtered left-corner', LC_STRATEGY),
- }
- choices = []
- if choice in strategies:
- choices = [choice]
- if choice == '6':
- choices = "1234"
- # Run the requested chart parser(s), except the stepping parser.
- for strategy in choices:
- print("* Strategy: " + strategies[strategy][0])
- print()
- cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
- t = time.time()
- chart = cp.chart_parse(tokens)
- parses = list(chart.parses(grammar.start()))
- times[strategies[strategy][0]] = time.time() - t
- print("Nr edges in chart:", len(chart.edges()))
- if numparses:
- assert len(parses) == numparses, 'Not all parses found'
- if print_trees:
- for tree in parses:
- print(tree)
- else:
- print("Nr trees:", len(parses))
- print()
- # Run the stepping parser, if requested.
- if choice in "56":
- print("* Strategy: Stepping (top-down vs bottom-up)")
- print()
- t = time.time()
- cp = SteppingChartParser(grammar, trace=trace)
- cp.initialize(tokens)
- for i in range(5):
- print('*** SWITCH TO TOP DOWN')
- cp.set_strategy(TD_STRATEGY)
- for j, e in enumerate(cp.step()):
- if j > 20 or e is None:
- break
- print('*** SWITCH TO BOTTOM UP')
- cp.set_strategy(BU_STRATEGY)
- for j, e in enumerate(cp.step()):
- if j > 20 or e is None:
- break
- times['Stepping'] = time.time() - t
- print("Nr edges in chart:", len(cp.chart().edges()))
- if numparses:
- assert len(list(cp.parses())) == numparses, 'Not all parses found'
- if print_trees:
- for tree in cp.parses():
- print(tree)
- else:
- print("Nr trees:", len(list(cp.parses())))
- print()
- # Print the times of all parsers:
- if not (print_times and times):
- return
- print("* Parsing times")
- print()
- maxlen = max(len(key) for key in times)
- format = '%' + repr(maxlen) + 's parser: %6.3fsec'
- times_items = times.items()
- for (parser, t) in sorted(times_items, key=lambda a: a[1]):
- print(format % (parser, t))
- if __name__ == '__main__':
- demo()
|