1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Context Free Grammars
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Steven Bird <stevenbird1@gmail.com>
- # Edward Loper <edloper@gmail.com>
- # Jason Narad <jason.narad@gmail.com>
- # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- #
- """
- Basic data classes for representing context free grammars. A
- "grammar" specifies which trees can represent the structure of a
- given text. Each of these trees is called a "parse tree" for the
- text (or simply a "parse"). In a "context free" grammar, the set of
- parse trees for any piece of a text can depend only on that piece, and
- not on the rest of the text (i.e., the piece's context). Context free
- grammars are often used to find possible syntactic structures for
- sentences. In this context, the leaves of a parse tree are word
- tokens; and the node values are phrasal categories, such as ``NP``
- and ``VP``.
- The ``CFG`` class is used to encode context free grammars. Each
- ``CFG`` consists of a start symbol and a set of productions.
- The "start symbol" specifies the root node value for parse trees. For example,
- the start symbol for syntactic parsing is usually ``S``. Start
- symbols are encoded using the ``Nonterminal`` class, which is discussed
- below.
- A Grammar's "productions" specify what parent-child relationships a parse
- tree can contain. Each production specifies that a particular
- node can be the parent of a particular set of children. For example,
- the production ``<S> -> <NP> <VP>`` specifies that an ``S`` node can
- be the parent of an ``NP`` node and a ``VP`` node.
- Grammar productions are implemented by the ``Production`` class.
- Each ``Production`` consists of a left hand side and a right hand
- side. The "left hand side" is a ``Nonterminal`` that specifies the
- node type for a potential parent; and the "right hand side" is a list
- that specifies allowable children for that parent. This lists
- consists of ``Nonterminals`` and text types: each ``Nonterminal``
- indicates that the corresponding child may be a ``TreeToken`` with the
- specified node type; and each text type indicates that the
- corresponding child may be a ``Token`` with the with that type.
- The ``Nonterminal`` class is used to distinguish node values from leaf
- values. This prevents the grammar from accidentally using a leaf
- value (such as the English word "A") as the node of a subtree. Within
- a ``CFG``, all node values are wrapped in the ``Nonterminal``
- class. Note, however, that the trees that are specified by the grammar do
- *not* include these ``Nonterminal`` wrappers.
- Grammars can also be given a more procedural interpretation. According to
- this interpretation, a Grammar specifies any tree structure *tree* that
- can be produced by the following procedure:
- | Set tree to the start symbol
- | Repeat until tree contains no more nonterminal leaves:
- | Choose a production prod with whose left hand side
- | lhs is a nonterminal leaf of tree.
- | Replace the nonterminal leaf with a subtree, whose node
- | value is the value wrapped by the nonterminal lhs, and
- | whose children are the right hand side of prod.
- The operation of replacing the left hand side (*lhs*) of a production
- with the right hand side (*rhs*) in a tree (*tree*) is known as
- "expanding" *lhs* to *rhs* in *tree*.
- """
- from __future__ import print_function, unicode_literals, division
- import re
- from functools import total_ordering
- from six import string_types
- from nltk.util import transitive_closure, invert_graph
- from nltk.compat import python_2_unicode_compatible, unicode_repr
- from nltk.internals import raise_unorderable_types
- from nltk.probability import ImmutableProbabilisticMixIn
- from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE
- #################################################################
- # Nonterminal
- #################################################################
- @total_ordering
- @python_2_unicode_compatible
- class Nonterminal(object):
- """
- A non-terminal symbol for a context free grammar. ``Nonterminal``
- is a wrapper class for node values; it is used by ``Production``
- objects to distinguish node values from leaf values.
- The node value that is wrapped by a ``Nonterminal`` is known as its
- "symbol". Symbols are typically strings representing phrasal
- categories (such as ``"NP"`` or ``"VP"``). However, more complex
- symbol types are sometimes used (e.g., for lexicalized grammars).
- Since symbols are node values, they must be immutable and
- hashable. Two ``Nonterminals`` are considered equal if their
- symbols are equal.
- :see: ``CFG``, ``Production``
- :type _symbol: any
- :ivar _symbol: The node value corresponding to this
- ``Nonterminal``. This value must be immutable and hashable.
- """
- def __init__(self, symbol):
- """
- Construct a new non-terminal from the given symbol.
- :type symbol: any
- :param symbol: The node value corresponding to this
- ``Nonterminal``. This value must be immutable and
- hashable.
- """
- self._symbol = symbol
- self._hash = hash(symbol)
- def symbol(self):
- """
- Return the node value corresponding to this ``Nonterminal``.
- :rtype: (any)
- """
- return self._symbol
- def __eq__(self, other):
- """
- Return True if this non-terminal is equal to ``other``. In
- particular, return True if ``other`` is a ``Nonterminal``
- and this non-terminal's symbol is equal to ``other`` 's symbol.
- :rtype: bool
- """
- return type(self) == type(other) and self._symbol == other._symbol
- def __ne__(self, other):
- return not self == other
- def __lt__(self, other):
- if not isinstance(other, Nonterminal):
- raise_unorderable_types("<", self, other)
- return self._symbol < other._symbol
- def __hash__(self):
- return self._hash
- def __repr__(self):
- """
- Return a string representation for this ``Nonterminal``.
- :rtype: str
- """
- if isinstance(self._symbol, string_types):
- return '%s' % self._symbol
- else:
- return '%s' % unicode_repr(self._symbol)
- def __str__(self):
- """
- Return a string representation for this ``Nonterminal``.
- :rtype: str
- """
- if isinstance(self._symbol, string_types):
- return '%s' % self._symbol
- else:
- return '%s' % unicode_repr(self._symbol)
- def __div__(self, rhs):
- """
- Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
- the symbol for this nonterminal, and ``B`` is the symbol for rhs.
- :param rhs: The nonterminal used to form the right hand side
- of the new nonterminal.
- :type rhs: Nonterminal
- :rtype: Nonterminal
- """
- return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
- def __truediv__(self, rhs):
- """
- Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
- the symbol for this nonterminal, and ``B`` is the symbol for rhs.
- This function allows use of the slash ``/`` operator with
- the future import of division.
- :param rhs: The nonterminal used to form the right hand side
- of the new nonterminal.
- :type rhs: Nonterminal
- :rtype: Nonterminal
- """
- return self.__div__(rhs)
- def nonterminals(symbols):
- """
- Given a string containing a list of symbol names, return a list of
- ``Nonterminals`` constructed from those symbols.
- :param symbols: The symbol name string. This string can be
- delimited by either spaces or commas.
- :type symbols: str
- :return: A list of ``Nonterminals`` constructed from the symbol
- names given in ``symbols``. The ``Nonterminals`` are sorted
- in the same order as the symbols names.
- :rtype: list(Nonterminal)
- """
- if ',' in symbols:
- symbol_list = symbols.split(',')
- else:
- symbol_list = symbols.split()
- return [Nonterminal(s.strip()) for s in symbol_list]
- class FeatStructNonterminal(FeatDict, Nonterminal):
- """A feature structure that's also a nonterminal. It acts as its
- own symbol, and automatically freezes itself when hashed."""
- def __hash__(self):
- self.freeze()
- return FeatStruct.__hash__(self)
- def symbol(self):
- return self
- def is_nonterminal(item):
- """
- :return: True if the item is a ``Nonterminal``.
- :rtype: bool
- """
- return isinstance(item, Nonterminal)
- #################################################################
- # Terminals
- #################################################################
- def is_terminal(item):
- """
- Return True if the item is a terminal, which currently is
- if it is hashable and not a ``Nonterminal``.
- :rtype: bool
- """
- return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)
- #################################################################
- # Productions
- #################################################################
- @total_ordering
- @python_2_unicode_compatible
- class Production(object):
- """
- A grammar production. Each production maps a single symbol
- on the "left-hand side" to a sequence of symbols on the
- "right-hand side". (In the case of context-free productions,
- the left-hand side must be a ``Nonterminal``, and the right-hand
- side is a sequence of terminals and ``Nonterminals``.)
- "terminals" can be any immutable hashable object that is
- not a ``Nonterminal``. Typically, terminals are strings
- representing words, such as ``"dog"`` or ``"under"``.
- :see: ``CFG``
- :see: ``DependencyGrammar``
- :see: ``Nonterminal``
- :type _lhs: Nonterminal
- :ivar _lhs: The left-hand side of the production.
- :type _rhs: tuple(Nonterminal, terminal)
- :ivar _rhs: The right-hand side of the production.
- """
- def __init__(self, lhs, rhs):
- """
- Construct a new ``Production``.
- :param lhs: The left-hand side of the new ``Production``.
- :type lhs: Nonterminal
- :param rhs: The right-hand side of the new ``Production``.
- :type rhs: sequence(Nonterminal and terminal)
- """
- if isinstance(rhs, string_types):
- raise TypeError(
- 'production right hand side should be a list, ' 'not a string'
- )
- self._lhs = lhs
- self._rhs = tuple(rhs)
- self._hash = hash((self._lhs, self._rhs))
- def lhs(self):
- """
- Return the left-hand side of this ``Production``.
- :rtype: Nonterminal
- """
- return self._lhs
- def rhs(self):
- """
- Return the right-hand side of this ``Production``.
- :rtype: sequence(Nonterminal and terminal)
- """
- return self._rhs
- def __len__(self):
- """
- Return the length of the right-hand side.
- :rtype: int
- """
- return len(self._rhs)
- def is_nonlexical(self):
- """
- Return True if the right-hand side only contains ``Nonterminals``
- :rtype: bool
- """
- return all(is_nonterminal(n) for n in self._rhs)
- def is_lexical(self):
- """
- Return True if the right-hand contain at least one terminal token.
- :rtype: bool
- """
- return not self.is_nonlexical()
- def __str__(self):
- """
- Return a verbose string representation of the ``Production``.
- :rtype: str
- """
- result = '%s -> ' % unicode_repr(self._lhs)
- result += " ".join(unicode_repr(el) for el in self._rhs)
- return result
- def __repr__(self):
- """
- Return a concise string representation of the ``Production``.
- :rtype: str
- """
- return '%s' % self
- def __eq__(self, other):
- """
- Return True if this ``Production`` is equal to ``other``.
- :rtype: bool
- """
- return (
- type(self) == type(other)
- and self._lhs == other._lhs
- and self._rhs == other._rhs
- )
- def __ne__(self, other):
- return not self == other
- def __lt__(self, other):
- if not isinstance(other, Production):
- raise_unorderable_types("<", self, other)
- return (self._lhs, self._rhs) < (other._lhs, other._rhs)
- def __hash__(self):
- """
- Return a hash value for the ``Production``.
- :rtype: int
- """
- return self._hash
- @python_2_unicode_compatible
- class DependencyProduction(Production):
- """
- A dependency grammar production. Each production maps a single
- head word to an unordered list of one or more modifier words.
- """
- def __str__(self):
- """
- Return a verbose string representation of the ``DependencyProduction``.
- :rtype: str
- """
- result = '\'%s\' ->' % (self._lhs,)
- for elt in self._rhs:
- result += ' \'%s\'' % (elt,)
- return result
- @python_2_unicode_compatible
- class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
- """
- A probabilistic context free grammar production.
- A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that
- has an associated probability, which represents how likely it is that
- this production will be used. In particular, the probability of a
- ``ProbabilisticProduction`` records the likelihood that its right-hand side is
- the correct instantiation for any given occurrence of its left-hand side.
- :see: ``Production``
- """
- def __init__(self, lhs, rhs, **prob):
- """
- Construct a new ``ProbabilisticProduction``.
- :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
- :type lhs: Nonterminal
- :param rhs: The right-hand side of the new ``ProbabilisticProduction``.
- :type rhs: sequence(Nonterminal and terminal)
- :param prob: Probability parameters of the new ``ProbabilisticProduction``.
- """
- ImmutableProbabilisticMixIn.__init__(self, **prob)
- Production.__init__(self, lhs, rhs)
- def __str__(self):
- return Production.__unicode__(self) + (
- ' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob()
- )
- def __eq__(self, other):
- return (
- type(self) == type(other)
- and self._lhs == other._lhs
- and self._rhs == other._rhs
- and self.prob() == other.prob()
- )
- def __ne__(self, other):
- return not self == other
- def __hash__(self):
- return hash((self._lhs, self._rhs, self.prob()))
- #################################################################
- # Grammars
- #################################################################
- @python_2_unicode_compatible
- class CFG(object):
- """
- A context-free grammar. A grammar consists of a start state and
- a set of productions. The set of terminals and nonterminals is
- implicitly specified by the productions.
- If you need efficient key-based access to productions, you
- can use a subclass to implement it.
- """
- def __init__(self, start, productions, calculate_leftcorners=True):
- """
- Create a new context-free grammar, from the given start state
- and set of ``Production``s.
- :param start: The start symbol
- :type start: Nonterminal
- :param productions: The list of productions that defines the grammar
- :type productions: list(Production)
- :param calculate_leftcorners: False if we don't want to calculate the
- leftcorner relation. In that case, some optimized chart parsers won't work.
- :type calculate_leftcorners: bool
- """
- if not is_nonterminal(start):
- raise TypeError(
- "start should be a Nonterminal object,"
- " not a %s" % type(start).__name__
- )
- self._start = start
- self._productions = productions
- self._categories = set(prod.lhs() for prod in productions)
- self._calculate_indexes()
- self._calculate_grammar_forms()
- if calculate_leftcorners:
- self._calculate_leftcorners()
- def _calculate_indexes(self):
- self._lhs_index = {}
- self._rhs_index = {}
- self._empty_index = {}
- self._lexical_index = {}
- for prod in self._productions:
- # Left hand side.
- lhs = prod._lhs
- if lhs not in self._lhs_index:
- self._lhs_index[lhs] = []
- self._lhs_index[lhs].append(prod)
- if prod._rhs:
- # First item in right hand side.
- rhs0 = prod._rhs[0]
- if rhs0 not in self._rhs_index:
- self._rhs_index[rhs0] = []
- self._rhs_index[rhs0].append(prod)
- else:
- # The right hand side is empty.
- self._empty_index[prod.lhs()] = prod
- # Lexical tokens in the right hand side.
- for token in prod._rhs:
- if is_terminal(token):
- self._lexical_index.setdefault(token, set()).add(prod)
- def _calculate_leftcorners(self):
- # Calculate leftcorner relations, for use in optimized parsing.
- self._immediate_leftcorner_categories = dict(
- (cat, set([cat])) for cat in self._categories
- )
- self._immediate_leftcorner_words = dict(
- (cat, set()) for cat in self._categories
- )
- for prod in self.productions():
- if len(prod) > 0:
- cat, left = prod.lhs(), prod.rhs()[0]
- if is_nonterminal(left):
- self._immediate_leftcorner_categories[cat].add(left)
- else:
- self._immediate_leftcorner_words[cat].add(left)
- lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True)
- self._leftcorners = lc
- self._leftcorner_parents = invert_graph(lc)
- nr_leftcorner_categories = sum(
- map(len, self._immediate_leftcorner_categories.values())
- )
- nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values()))
- if nr_leftcorner_words > nr_leftcorner_categories > 10000:
- # If the grammar is big, the leftcorner-word dictionary will be too large.
- # In that case it is better to calculate the relation on demand.
- self._leftcorner_words = None
- return
- self._leftcorner_words = {}
- for cat in self._leftcorners:
- lefts = self._leftcorners[cat]
- lc = self._leftcorner_words[cat] = set()
- for left in lefts:
- lc.update(self._immediate_leftcorner_words.get(left, set()))
- @classmethod
- def fromstring(cls, input, encoding=None):
- """
- Return the grammar instance corresponding to the input string(s).
- :param input: a grammar, either in the form of a string or as a list of strings.
- """
- start, productions = read_grammar(
- input, standard_nonterm_parser, encoding=encoding
- )
- return cls(start, productions)
- def start(self):
- """
- Return the start symbol of the grammar
- :rtype: Nonterminal
- """
- return self._start
- # tricky to balance readability and efficiency here!
- # can't use set operations as they don't preserve ordering
- def productions(self, lhs=None, rhs=None, empty=False):
- """
- Return the grammar productions, filtered by the left-hand side
- or the first item in the right-hand side.
- :param lhs: Only return productions with the given left-hand side.
- :param rhs: Only return productions with the given first item
- in the right-hand side.
- :param empty: Only return productions with an empty right-hand side.
- :return: A list of productions matching the given constraints.
- :rtype: list(Production)
- """
- if rhs and empty:
- raise ValueError(
- "You cannot select empty and non-empty " "productions at the same time."
- )
- # no constraints so return everything
- if not lhs and not rhs:
- if not empty:
- return self._productions
- else:
- return self._empty_index.values()
- # only lhs specified so look up its index
- elif lhs and not rhs:
- if not empty:
- return self._lhs_index.get(lhs, [])
- elif lhs in self._empty_index:
- return [self._empty_index[lhs]]
- else:
- return []
- # only rhs specified so look up its index
- elif rhs and not lhs:
- return self._rhs_index.get(rhs, [])
- # intersect
- else:
- return [
- prod
- for prod in self._lhs_index.get(lhs, [])
- if prod in self._rhs_index.get(rhs, [])
- ]
- def leftcorners(self, cat):
- """
- Return the set of all nonterminals that the given nonterminal
- can start with, including itself.
- This is the reflexive, transitive closure of the immediate
- leftcorner relation: (A > B) iff (A -> B beta)
- :param cat: the parent of the leftcorners
- :type cat: Nonterminal
- :return: the set of all leftcorners
- :rtype: set(Nonterminal)
- """
- return self._leftcorners.get(cat, set([cat]))
- def is_leftcorner(self, cat, left):
- """
- True if left is a leftcorner of cat, where left can be a
- terminal or a nonterminal.
- :param cat: the parent of the leftcorner
- :type cat: Nonterminal
- :param left: the suggested leftcorner
- :type left: Terminal or Nonterminal
- :rtype: bool
- """
- if is_nonterminal(left):
- return left in self.leftcorners(cat)
- elif self._leftcorner_words:
- return left in self._leftcorner_words.get(cat, set())
- else:
- return any(
- left in self._immediate_leftcorner_words.get(parent, set())
- for parent in self.leftcorners(cat)
- )
- def leftcorner_parents(self, cat):
- """
- Return the set of all nonterminals for which the given category
- is a left corner. This is the inverse of the leftcorner relation.
- :param cat: the suggested leftcorner
- :type cat: Nonterminal
- :return: the set of all parents to the leftcorner
- :rtype: set(Nonterminal)
- """
- return self._leftcorner_parents.get(cat, set([cat]))
- def check_coverage(self, tokens):
- """
- Check whether the grammar rules cover the given list of tokens.
- If not, then raise an exception.
- :type tokens: list(str)
- """
- missing = [tok for tok in tokens if not self._lexical_index.get(tok)]
- if missing:
- missing = ', '.join('%r' % (w,) for w in missing)
- raise ValueError(
- "Grammar does not cover some of the " "input words: %r." % missing
- )
- def _calculate_grammar_forms(self):
- """
- Pre-calculate of which form(s) the grammar is.
- """
- prods = self._productions
- self._is_lexical = all(p.is_lexical() for p in prods)
- self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1)
- self._min_len = min(len(p) for p in prods)
- self._max_len = max(len(p) for p in prods)
- self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1)
- def is_lexical(self):
- """
- Return True if all productions are lexicalised.
- """
- return self._is_lexical
- def is_nonlexical(self):
- """
- Return True if all lexical rules are "preterminals", that is,
- unary rules which can be separated in a preprocessing step.
- This means that all productions are of the forms
- A -> B1 ... Bn (n>=0), or A -> "s".
- Note: is_lexical() and is_nonlexical() are not opposites.
- There are grammars which are neither, and grammars which are both.
- """
- return self._is_nonlexical
- def min_len(self):
- """
- Return the right-hand side length of the shortest grammar production.
- """
- return self._min_len
- def max_len(self):
- """
- Return the right-hand side length of the longest grammar production.
- """
- return self._max_len
- def is_nonempty(self):
- """
- Return True if there are no empty productions.
- """
- return self._min_len > 0
- def is_binarised(self):
- """
- Return True if all productions are at most binary.
- Note that there can still be empty and unary productions.
- """
- return self._max_len <= 2
- def is_flexible_chomsky_normal_form(self):
- """
- Return True if all productions are of the forms
- A -> B C, A -> B, or A -> "s".
- """
- return self.is_nonempty() and self.is_nonlexical() and self.is_binarised()
- def is_chomsky_normal_form(self):
- """
- Return True if the grammar is of Chomsky Normal Form, i.e. all productions
- are of the form A -> B C, or A -> "s".
- """
- return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical
- def chomsky_normal_form(self, new_token_padding='@$@', flexible=False):
- """
- Returns a new Grammer that is in chomsky normal
- :param: new_token_padding
- Customise new rule formation during binarisation
- """
- if self.is_chomsky_normal_form():
- return
- if self.productions(empty=True):
- raise ValueError(('Grammar has Empty rules. '
- 'Cannot deal with them at the moment'))
- # check for mixed rules
- for rule in self.productions():
- if rule.is_lexical() and len(rule.rhs()) > 1:
- raise ValueError(
- 'Cannot handled mixed rule {} => {}'.format(rule.lhs(),
- rule.rhs()))
- step1 = CFG.eliminate_start(self)
- step2 = CFG.binarize(step1, new_token_padding)
- if flexible:
- return step2
- step3 = CFG.remove_unitary_rules(step2)
- return step3
- @classmethod
- def remove_unitary_rules(cls, grammar):
- """
- Remove nonlexical unitary rules and convert them to
- lexical
- """
- result = []
- unitary = []
- for rule in grammar.productions():
- if len(rule) == 1 and rule.is_nonlexical():
- unitary.append(rule)
- else:
- result.append(rule)
- while unitary:
- rule = unitary.pop(0)
- for item in grammar.productions(lhs=rule.rhs()[0]):
- new_rule = Production(rule.lhs(), item.rhs())
- if len(new_rule) != 1 or new_rule.is_lexical():
- result.append(new_rule)
- else:
- unitary.append(new_rule)
- n_grammar = CFG(grammar.start(), result)
- return n_grammar
- @classmethod
- def binarize(cls, grammar, padding='@$@'):
- """
- Convert all non-binary rules into binary by introducing
- new tokens.
- Example::
- Original:
- A => B C D
- After Conversion:
- A => B A@$@B
- A@$@B => C D
- """
- result = []
- for rule in grammar.productions():
- if len(rule.rhs()) > 2:
- # this rule needs to be broken down
- left_side = rule.lhs()
- for k in range(0, len(rule.rhs()) - 2):
- tsym = rule.rhs()[k]
- new_sym = Nonterminal(
- left_side.symbol() + padding + tsym.symbol()
- )
- new_production = Production(left_side, (tsym, new_sym))
- left_side = new_sym
- result.append(new_production)
- last_prd = Production(left_side, rule.rhs()[-2:])
- result.append(last_prd)
- else:
- result.append(rule)
- n_grammar = CFG(grammar.start(), result)
- return n_grammar
- @classmethod
- def eliminate_start(cls, grammar):
- """
- Eliminate start rule in case it appears on RHS
- Example: S -> S0 S1 and S0 -> S1 S
- Then another rule S0_Sigma -> S is added
- """
- start = grammar.start()
- result = []
- need_to_add = None
- for rule in grammar.productions():
- if start in rule.rhs():
- need_to_add = True
- result.append(rule)
- if need_to_add:
- start = Nonterminal('S0_SIGMA')
- result.append(Production(start, grammar.start()))
- n_grammar = CFG(start, result)
- return n_grammar
- return grammar
- def __repr__(self):
- return '<Grammar with %d productions>' % len(self._productions)
- def __str__(self):
- result = 'Grammar with %d productions' % len(self._productions)
- result += ' (start state = %r)' % self._start
- for production in self._productions:
- result += '\n %s' % production
- return result
- class FeatureGrammar(CFG):
- """
- A feature-based grammar. This is equivalent to a
- ``CFG`` whose nonterminals are all
- ``FeatStructNonterminal``.
- A grammar consists of a start state and a set of
- productions. The set of terminals and nonterminals
- is implicitly specified by the productions.
- """
- def __init__(self, start, productions):
- """
- Create a new feature-based grammar, from the given start
- state and set of ``Productions``.
- :param start: The start symbol
- :type start: FeatStructNonterminal
- :param productions: The list of productions that defines the grammar
- :type productions: list(Production)
- """
- CFG.__init__(self, start, productions)
- # The difference with CFG is that the productions are
- # indexed on the TYPE feature of the nonterminals.
- # This is calculated by the method _get_type_if_possible().
- def _calculate_indexes(self):
- self._lhs_index = {}
- self._rhs_index = {}
- self._empty_index = {}
- self._empty_productions = []
- self._lexical_index = {}
- for prod in self._productions:
- # Left hand side.
- lhs = self._get_type_if_possible(prod._lhs)
- if lhs not in self._lhs_index:
- self._lhs_index[lhs] = []
- self._lhs_index[lhs].append(prod)
- if prod._rhs:
- # First item in right hand side.
- rhs0 = self._get_type_if_possible(prod._rhs[0])
- if rhs0 not in self._rhs_index:
- self._rhs_index[rhs0] = []
- self._rhs_index[rhs0].append(prod)
- else:
- # The right hand side is empty.
- if lhs not in self._empty_index:
- self._empty_index[lhs] = []
- self._empty_index[lhs].append(prod)
- self._empty_productions.append(prod)
- # Lexical tokens in the right hand side.
- for token in prod._rhs:
- if is_terminal(token):
- self._lexical_index.setdefault(token, set()).add(prod)
- @classmethod
- def fromstring(
- cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None
- ):
- """
- Return a feature structure based grammar.
- :param input: a grammar, either in the form of a string or else
- as a list of strings.
- :param features: a tuple of features (default: SLASH, TYPE)
- :param logic_parser: a parser for lambda-expressions,
- by default, ``LogicParser()``
- :param fstruct_reader: a feature structure parser
- (only if features and logic_parser is None)
- """
- if features is None:
- features = (SLASH, TYPE)
- if fstruct_reader is None:
- fstruct_reader = FeatStructReader(
- features, FeatStructNonterminal, logic_parser=logic_parser
- )
- elif logic_parser is not None:
- raise Exception(
- '\'logic_parser\' and \'fstruct_reader\' must ' 'not both be set'
- )
- start, productions = read_grammar(
- input, fstruct_reader.read_partial, encoding=encoding
- )
- return cls(start, productions)
- def productions(self, lhs=None, rhs=None, empty=False):
- """
- Return the grammar productions, filtered by the left-hand side
- or the first item in the right-hand side.
- :param lhs: Only return productions with the given left-hand side.
- :param rhs: Only return productions with the given first item
- in the right-hand side.
- :param empty: Only return productions with an empty right-hand side.
- :rtype: list(Production)
- """
- if rhs and empty:
- raise ValueError(
- "You cannot select empty and non-empty " "productions at the same time."
- )
- # no constraints so return everything
- if not lhs and not rhs:
- if empty:
- return self._empty_productions
- else:
- return self._productions
- # only lhs specified so look up its index
- elif lhs and not rhs:
- if empty:
- return self._empty_index.get(self._get_type_if_possible(lhs), [])
- else:
- return self._lhs_index.get(self._get_type_if_possible(lhs), [])
- # only rhs specified so look up its index
- elif rhs and not lhs:
- return self._rhs_index.get(self._get_type_if_possible(rhs), [])
- # intersect
- else:
- return [
- prod
- for prod in self._lhs_index.get(self._get_type_if_possible(lhs), [])
- if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])
- ]
- def leftcorners(self, cat):
- """
- Return the set of all words that the given category can start with.
- Also called the "first set" in compiler construction.
- """
- raise NotImplementedError("Not implemented yet")
- def leftcorner_parents(self, cat):
- """
- Return the set of all categories for which the given category
- is a left corner.
- """
- raise NotImplementedError("Not implemented yet")
- def _get_type_if_possible(self, item):
- """
- Helper function which returns the ``TYPE`` feature of the ``item``,
- if it exists, otherwise it returns the ``item`` itself
- """
- if isinstance(item, dict) and TYPE in item:
- return FeatureValueType(item[TYPE])
- else:
- return item
- @total_ordering
- @python_2_unicode_compatible
- class FeatureValueType(object):
- """
- A helper class for ``FeatureGrammars``, designed to be different
- from ordinary strings. This is to stop the ``FeatStruct``
- ``FOO[]`` from being compare equal to the terminal "FOO".
- """
- def __init__(self, value):
- self._value = value
- self._hash = hash(value)
- def __repr__(self):
- return '<%s>' % self._value
- def __eq__(self, other):
- return type(self) == type(other) and self._value == other._value
- def __ne__(self, other):
- return not self == other
- def __lt__(self, other):
- if not isinstance(other, FeatureValueType):
- raise_unorderable_types("<", self, other)
- return self._value < other._value
- def __hash__(self):
- return self._hash
- @python_2_unicode_compatible
- class DependencyGrammar(object):
- """
- A dependency grammar. A DependencyGrammar consists of a set of
- productions. Each production specifies a head/modifier relationship
- between a pair of words.
- """
- def __init__(self, productions):
- """
- Create a new dependency grammar, from the set of ``Productions``.
- :param productions: The list of productions that defines the grammar
- :type productions: list(Production)
- """
- self._productions = productions
- @classmethod
- def fromstring(cls, input):
- productions = []
- for linenum, line in enumerate(input.split('\n')):
- line = line.strip()
- if line.startswith('#') or line == '':
- continue
- try:
- productions += _read_dependency_production(line)
- except ValueError:
- raise ValueError('Unable to parse line %s: %s' % (linenum, line))
- if len(productions) == 0:
- raise ValueError('No productions found!')
- return cls(productions)
- def contains(self, head, mod):
- """
- :param head: A head word.
- :type head: str
- :param mod: A mod word, to test as a modifier of 'head'.
- :type mod: str
- :return: true if this ``DependencyGrammar`` contains a
- ``DependencyProduction`` mapping 'head' to 'mod'.
- :rtype: bool
- """
- for production in self._productions:
- for possibleMod in production._rhs:
- if production._lhs == head and possibleMod == mod:
- return True
- return False
- def __contains__(self, head, mod):
- """
- Return True if this ``DependencyGrammar`` contains a
- ``DependencyProduction`` mapping 'head' to 'mod'.
- :param head: A head word.
- :type head: str
- :param mod: A mod word, to test as a modifier of 'head'.
- :type mod: str
- :rtype: bool
- """
- for production in self._productions:
- for possibleMod in production._rhs:
- if production._lhs == head and possibleMod == mod:
- return True
- return False
- # # should be rewritten, the set comp won't work in all comparisons
- # def contains_exactly(self, head, modlist):
- # for production in self._productions:
- # if(len(production._rhs) == len(modlist)):
- # if(production._lhs == head):
- # set1 = Set(production._rhs)
- # set2 = Set(modlist)
- # if(set1 == set2):
- # return True
- # return False
- def __str__(self):
- """
- Return a verbose string representation of the ``DependencyGrammar``
- :rtype: str
- """
- str = 'Dependency grammar with %d productions' % len(self._productions)
- for production in self._productions:
- str += '\n %s' % production
- return str
- def __repr__(self):
- """
- Return a concise string representation of the ``DependencyGrammar``
- """
- return 'Dependency grammar with %d productions' % len(self._productions)
- @python_2_unicode_compatible
- class ProbabilisticDependencyGrammar(object):
- """
- """
- def __init__(self, productions, events, tags):
- self._productions = productions
- self._events = events
- self._tags = tags
- def contains(self, head, mod):
- """
- Return True if this ``DependencyGrammar`` contains a
- ``DependencyProduction`` mapping 'head' to 'mod'.
- :param head: A head word.
- :type head: str
- :param mod: A mod word, to test as a modifier of 'head'.
- :type mod: str
- :rtype: bool
- """
- for production in self._productions:
- for possibleMod in production._rhs:
- if production._lhs == head and possibleMod == mod:
- return True
- return False
- def __str__(self):
- """
- Return a verbose string representation of the ``ProbabilisticDependencyGrammar``
- :rtype: str
- """
- str = 'Statistical dependency grammar with %d productions' % len(
- self._productions
- )
- for production in self._productions:
- str += '\n %s' % production
- str += '\nEvents:'
- for event in self._events:
- str += '\n %d:%s' % (self._events[event], event)
- str += '\nTags:'
- for tag_word in self._tags:
- str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word])
- return str
- def __repr__(self):
- """
- Return a concise string representation of the ``ProbabilisticDependencyGrammar``
- """
- return 'Statistical Dependency grammar with %d productions' % len(
- self._productions
- )
- class PCFG(CFG):
- """
- A probabilistic context-free grammar. A PCFG consists of a
- start state and a set of productions with probabilities. The set of
- terminals and nonterminals is implicitly specified by the productions.
- PCFG productions use the ``ProbabilisticProduction`` class.
- ``PCFGs`` impose the constraint that the set of productions with
- any given left-hand-side must have probabilities that sum to 1
- (allowing for a small margin of error).
- If you need efficient key-based access to productions, you can use
- a subclass to implement it.
- :type EPSILON: float
- :cvar EPSILON: The acceptable margin of error for checking that
- productions with a given left-hand side have probabilities
- that sum to 1.
- """
- EPSILON = 0.01
- def __init__(self, start, productions, calculate_leftcorners=True):
- """
- Create a new context-free grammar, from the given start state
- and set of ``ProbabilisticProductions``.
- :param start: The start symbol
- :type start: Nonterminal
- :param productions: The list of productions that defines the grammar
- :type productions: list(Production)
- :raise ValueError: if the set of productions with any left-hand-side
- do not have probabilities that sum to a value within
- EPSILON of 1.
- :param calculate_leftcorners: False if we don't want to calculate the
- leftcorner relation. In that case, some optimized chart parsers won't work.
- :type calculate_leftcorners: bool
- """
- CFG.__init__(self, start, productions, calculate_leftcorners)
- # Make sure that the probabilities sum to one.
- probs = {}
- for production in productions:
- probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob()
- for (lhs, p) in probs.items():
- if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)):
- raise ValueError("Productions for %r do not sum to 1" % lhs)
- @classmethod
- def fromstring(cls, input, encoding=None):
- """
- Return a probabilistic context-free grammar corresponding to the
- input string(s).
- :param input: a grammar, either in the form of a string or else
- as a list of strings.
- """
- start, productions = read_grammar(
- input, standard_nonterm_parser, probabilistic=True, encoding=encoding
- )
- return cls(start, productions)
- #################################################################
- # Inducing Grammars
- #################################################################
- # Contributed by Nathan Bodenstab <bodenstab@cslu.ogi.edu>
- def induce_pcfg(start, productions):
- """
- Induce a PCFG grammar from a list of productions.
- The probability of a production A -> B C in a PCFG is:
- | count(A -> B C)
- | P(B, C | A) = --------------- where \* is any right hand side
- | count(A -> \*)
- :param start: The start symbol
- :type start: Nonterminal
- :param productions: The list of productions that defines the grammar
- :type productions: list(Production)
- """
- # Production count: the number of times a given production occurs
- pcount = {}
- # LHS-count: counts the number of times a given lhs occurs
- lcount = {}
- for prod in productions:
- lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
- pcount[prod] = pcount.get(prod, 0) + 1
- prods = [
- ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()])
- for p in pcount
- ]
- return PCFG(start, prods)
- #################################################################
- # Helper functions for reading productions
- #################################################################
- def _read_cfg_production(input):
- """
- Return a list of context-free ``Productions``.
- """
- return _read_production(input, standard_nonterm_parser)
- def _read_pcfg_production(input):
- """
- Return a list of PCFG ``ProbabilisticProductions``.
- """
- return _read_production(input, standard_nonterm_parser, probabilistic=True)
- def _read_fcfg_production(input, fstruct_reader):
- """
- Return a list of feature-based ``Productions``.
- """
- return _read_production(input, fstruct_reader)
- # Parsing generic grammars
- _ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE)
- _PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE)
- _TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
- _DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)
- def _read_production(line, nonterm_parser, probabilistic=False):
- """
- Parse a grammar rule, given as a string, and return
- a list of productions.
- """
- pos = 0
- # Parse the left-hand side.
- lhs, pos = nonterm_parser(line, pos)
- # Skip over the arrow.
- m = _ARROW_RE.match(line, pos)
- if not m:
- raise ValueError('Expected an arrow')
- pos = m.end()
- # Parse the right hand side.
- probabilities = [0.0]
- rhsides = [[]]
- while pos < len(line):
- # Probability.
- m = _PROBABILITY_RE.match(line, pos)
- if probabilistic and m:
- pos = m.end()
- probabilities[-1] = float(m.group(1)[1:-1])
- if probabilities[-1] > 1.0:
- raise ValueError(
- 'Production probability %f, '
- 'should not be greater than 1.0' % (probabilities[-1],)
- )
- # String -- add terminal.
- elif line[pos] in "\'\"":
- m = _TERMINAL_RE.match(line, pos)
- if not m:
- raise ValueError('Unterminated string')
- rhsides[-1].append(m.group(1)[1:-1])
- pos = m.end()
- # Vertical bar -- start new rhside.
- elif line[pos] == '|':
- m = _DISJUNCTION_RE.match(line, pos)
- probabilities.append(0.0)
- rhsides.append([])
- pos = m.end()
- # Anything else -- nonterminal.
- else:
- nonterm, pos = nonterm_parser(line, pos)
- rhsides[-1].append(nonterm)
- if probabilistic:
- return [
- ProbabilisticProduction(lhs, rhs, prob=probability)
- for (rhs, probability) in zip(rhsides, probabilities)
- ]
- else:
- return [Production(lhs, rhs) for rhs in rhsides]
- #################################################################
- # Reading Phrase Structure Grammars
- #################################################################
- def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
- """
- Return a pair consisting of a starting category and a list of
- ``Productions``.
- :param input: a grammar, either in the form of a string or else
- as a list of strings.
- :param nonterm_parser: a function for parsing nonterminals.
- It should take a ``(string, position)`` as argument and
- return a ``(nonterminal, position)`` as result.
- :param probabilistic: are the grammar rules probabilistic?
- :type probabilistic: bool
- :param encoding: the encoding of the grammar, if it is a binary string
- :type encoding: str
- """
- if encoding is not None:
- input = input.decode(encoding)
- if isinstance(input, string_types):
- lines = input.split('\n')
- else:
- lines = input
- start = None
- productions = []
- continue_line = ''
- for linenum, line in enumerate(lines):
- line = continue_line + line.strip()
- if line.startswith('#') or line == '':
- continue
- if line.endswith('\\'):
- continue_line = line[:-1].rstrip() + ' '
- continue
- continue_line = ''
- try:
- if line[0] == '%':
- directive, args = line[1:].split(None, 1)
- if directive == 'start':
- start, pos = nonterm_parser(args, 0)
- if pos != len(args):
- raise ValueError('Bad argument to start directive')
- else:
- raise ValueError('Bad directive')
- else:
- # expand out the disjunctions on the RHS
- productions += _read_production(line, nonterm_parser, probabilistic)
- except ValueError as e:
- raise ValueError('Unable to parse line %s: %s\n%s' % (linenum + 1, line, e))
- if not productions:
- raise ValueError('No productions found!')
- if not start:
- start = productions[0].lhs()
- return (start, productions)
- _STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
- def standard_nonterm_parser(string, pos):
- m = _STANDARD_NONTERM_RE.match(string, pos)
- if not m:
- raise ValueError('Expected a nonterminal, found: ' + string[pos:])
- return (Nonterminal(m.group(1)), m.end())
- #################################################################
- # Reading Dependency Grammars
- #################################################################
- _READ_DG_RE = re.compile(
- r'''^\s* # leading whitespace
- ('[^']+')\s* # single-quoted lhs
- (?:[-=]+>)\s* # arrow
- (?:( # rhs:
- "[^"]+" # doubled-quoted terminal
- | '[^']+' # single-quoted terminal
- | \| # disjunction
- )
- \s*) # trailing space
- *$''', # zero or more copies
- re.VERBOSE,
- )
- _SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')
- def _read_dependency_production(s):
- if not _READ_DG_RE.match(s):
- raise ValueError('Bad production string')
- pieces = _SPLIT_DG_RE.split(s)
- pieces = [p for i, p in enumerate(pieces) if i % 2 == 1]
- lhside = pieces[0].strip('\'\"')
- rhsides = [[]]
- for piece in pieces[2:]:
- if piece == '|':
- rhsides.append([])
- else:
- rhsides[-1].append(piece.strip('\'\"'))
- return [DependencyProduction(lhside, rhside) for rhside in rhsides]
- #################################################################
- # Demonstration
- #################################################################
- def cfg_demo():
- """
- A demonstration showing how ``CFGs`` can be created and used.
- """
- from nltk import nonterminals, Production, CFG
- # Create some nonterminals
- S, NP, VP, PP = nonterminals('S, NP, VP, PP')
- N, V, P, Det = nonterminals('N, V, P, Det')
- VP_slash_NP = VP / NP
- print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
- print(' S.symbol() =>', repr(S.symbol()))
- print()
- print(Production(S, [NP]))
- # Create some Grammar Productions
- grammar = CFG.fromstring(
- """
- S -> NP VP
- PP -> P NP
- NP -> Det N | NP PP
- VP -> V NP | VP PP
- Det -> 'a' | 'the'
- N -> 'dog' | 'cat'
- V -> 'chased' | 'sat'
- P -> 'on' | 'in'
- """
- )
- print('A Grammar:', repr(grammar))
- print(' grammar.start() =>', repr(grammar.start()))
- print(' grammar.productions() =>', end=' ')
- # Use string.replace(...) is to line-wrap the output.
- print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
- print()
- toy_pcfg1 = PCFG.fromstring(
- """
- S -> NP VP [1.0]
- NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
- Det -> 'the' [0.8] | 'my' [0.2]
- N -> 'man' [0.5] | 'telescope' [0.5]
- VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
- V -> 'ate' [0.35] | 'saw' [0.65]
- PP -> P NP [1.0]
- P -> 'with' [0.61] | 'under' [0.39]
- """
- )
- toy_pcfg2 = PCFG.fromstring(
- """
- S -> NP VP [1.0]
- VP -> V NP [.59]
- VP -> V [.40]
- VP -> VP PP [.01]
- NP -> Det N [.41]
- NP -> Name [.28]
- NP -> NP PP [.31]
- PP -> P NP [1.0]
- V -> 'saw' [.21]
- V -> 'ate' [.51]
- V -> 'ran' [.28]
- N -> 'boy' [.11]
- N -> 'cookie' [.12]
- N -> 'table' [.13]
- N -> 'telescope' [.14]
- N -> 'hill' [.5]
- Name -> 'Jack' [.52]
- Name -> 'Bob' [.48]
- P -> 'with' [.61]
- P -> 'under' [.39]
- Det -> 'the' [.41]
- Det -> 'a' [.31]
- Det -> 'my' [.28]
- """
- )
- def pcfg_demo():
- """
- A demonstration showing how a ``PCFG`` can be created and used.
- """
- from nltk.corpus import treebank
- from nltk import treetransforms
- from nltk import induce_pcfg
- from nltk.parse import pchart
- pcfg_prods = toy_pcfg1.productions()
- pcfg_prod = pcfg_prods[2]
- print('A PCFG production:', repr(pcfg_prod))
- print(' pcfg_prod.lhs() =>', repr(pcfg_prod.lhs()))
- print(' pcfg_prod.rhs() =>', repr(pcfg_prod.rhs()))
- print(' pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
- print()
- grammar = toy_pcfg2
- print('A PCFG grammar:', repr(grammar))
- print(' grammar.start() =>', repr(grammar.start()))
- print(' grammar.productions() =>', end=' ')
- # Use .replace(...) is to line-wrap the output.
- print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
- print()
- # extract productions from three trees and induce the PCFG
- print("Induce PCFG grammar from treebank data:")
- productions = []
- item = treebank._fileids[0]
- for tree in treebank.parsed_sents(item)[:3]:
- # perform optional tree transformations, e.g.:
- tree.collapse_unary(collapsePOS=False)
- tree.chomsky_normal_form(horzMarkov=2)
- productions += tree.productions()
- S = Nonterminal('S')
- grammar = induce_pcfg(S, productions)
- print(grammar)
- print()
- print("Parse sentence using induced grammar:")
- parser = pchart.InsideChartParser(grammar)
- parser.trace(3)
- # doesn't work as tokens are different:
- # sent = treebank.tokenized('wsj_0001.mrg')[0]
- sent = treebank.parsed_sents(item)[0].leaves()
- print(sent)
- for parse in parser.parse(sent):
- print(parse)
- def fcfg_demo():
- import nltk.data
- g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
- print(g)
- print()
- def dg_demo():
- """
- A demonstration showing the creation and inspection of a
- ``DependencyGrammar``.
- """
- grammar = DependencyGrammar.fromstring(
- """
- 'scratch' -> 'cats' | 'walls'
- 'walls' -> 'the'
- 'cats' -> 'the'
- """
- )
- print(grammar)
- def sdg_demo():
- """
- A demonstration of how to read a string representation of
- a CoNLL format dependency tree.
- """
- from nltk.parse import DependencyGraph
- dg = DependencyGraph(
- """
- 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
- 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
- 3 met met Prep Prep voor 8 mod _ _
- 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
- 5 moeder moeder N N soort|ev|neut 3 obj1 _ _
- 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
- 7 gaan ga V V hulp|inf 6 vc _ _
- 8 winkelen winkel V V intrans|inf 11 cnj _ _
- 9 , , Punc Punc komma 8 punct _ _
- 10 zwemmen zwem V V intrans|inf 11 cnj _ _
- 11 of of Conj Conj neven 7 vc _ _
- 12 terrassen terras N N soort|mv|neut 11 cnj _ _
- 13 . . Punc Punc punt 12 punct _ _
- """
- )
- tree = dg.tree()
- print(tree.pprint())
- def demo():
- cfg_demo()
- pcfg_demo()
- fcfg_demo()
- dg_demo()
- sdg_demo()
- if __name__ == '__main__':
- demo()
- __all__ = [
- 'Nonterminal',
- 'nonterminals',
- 'CFG',
- 'Production',
- 'PCFG',
- 'ProbabilisticProduction',
- 'DependencyGrammar',
- 'DependencyProduction',
- 'ProbabilisticDependencyGrammar',
- 'induce_pcfg',
- 'read_grammar',
- ]
|