grammar.py 57 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Context Free Grammars
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Steven Bird <stevenbird1@gmail.com>
  6. # Edward Loper <edloper@gmail.com>
  7. # Jason Narad <jason.narad@gmail.com>
  8. # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  9. # URL: <http://nltk.org/>
  10. # For license information, see LICENSE.TXT
  11. #
  12. """
  13. Basic data classes for representing context free grammars. A
  14. "grammar" specifies which trees can represent the structure of a
  15. given text. Each of these trees is called a "parse tree" for the
  16. text (or simply a "parse"). In a "context free" grammar, the set of
  17. parse trees for any piece of a text can depend only on that piece, and
  18. not on the rest of the text (i.e., the piece's context). Context free
  19. grammars are often used to find possible syntactic structures for
  20. sentences. In this context, the leaves of a parse tree are word
  21. tokens; and the node values are phrasal categories, such as ``NP``
  22. and ``VP``.
  23. The ``CFG`` class is used to encode context free grammars. Each
  24. ``CFG`` consists of a start symbol and a set of productions.
  25. The "start symbol" specifies the root node value for parse trees. For example,
  26. the start symbol for syntactic parsing is usually ``S``. Start
  27. symbols are encoded using the ``Nonterminal`` class, which is discussed
  28. below.
  29. A Grammar's "productions" specify what parent-child relationships a parse
  30. tree can contain. Each production specifies that a particular
  31. node can be the parent of a particular set of children. For example,
  32. the production ``<S> -> <NP> <VP>`` specifies that an ``S`` node can
  33. be the parent of an ``NP`` node and a ``VP`` node.
  34. Grammar productions are implemented by the ``Production`` class.
  35. Each ``Production`` consists of a left hand side and a right hand
  36. side. The "left hand side" is a ``Nonterminal`` that specifies the
  37. node type for a potential parent; and the "right hand side" is a list
  38. that specifies allowable children for that parent. This lists
  39. consists of ``Nonterminals`` and text types: each ``Nonterminal``
  40. indicates that the corresponding child may be a ``TreeToken`` with the
  41. specified node type; and each text type indicates that the
  42. corresponding child may be a ``Token`` with the with that type.
  43. The ``Nonterminal`` class is used to distinguish node values from leaf
  44. values. This prevents the grammar from accidentally using a leaf
  45. value (such as the English word "A") as the node of a subtree. Within
  46. a ``CFG``, all node values are wrapped in the ``Nonterminal``
  47. class. Note, however, that the trees that are specified by the grammar do
  48. *not* include these ``Nonterminal`` wrappers.
  49. Grammars can also be given a more procedural interpretation. According to
  50. this interpretation, a Grammar specifies any tree structure *tree* that
  51. can be produced by the following procedure:
  52. | Set tree to the start symbol
  53. | Repeat until tree contains no more nonterminal leaves:
  54. | Choose a production prod with whose left hand side
  55. | lhs is a nonterminal leaf of tree.
  56. | Replace the nonterminal leaf with a subtree, whose node
  57. | value is the value wrapped by the nonterminal lhs, and
  58. | whose children are the right hand side of prod.
  59. The operation of replacing the left hand side (*lhs*) of a production
  60. with the right hand side (*rhs*) in a tree (*tree*) is known as
  61. "expanding" *lhs* to *rhs* in *tree*.
  62. """
  63. from __future__ import print_function, unicode_literals, division
  64. import re
  65. from functools import total_ordering
  66. from six import string_types
  67. from nltk.util import transitive_closure, invert_graph
  68. from nltk.compat import python_2_unicode_compatible, unicode_repr
  69. from nltk.internals import raise_unorderable_types
  70. from nltk.probability import ImmutableProbabilisticMixIn
  71. from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE
  72. #################################################################
  73. # Nonterminal
  74. #################################################################
  75. @total_ordering
  76. @python_2_unicode_compatible
  77. class Nonterminal(object):
  78. """
  79. A non-terminal symbol for a context free grammar. ``Nonterminal``
  80. is a wrapper class for node values; it is used by ``Production``
  81. objects to distinguish node values from leaf values.
  82. The node value that is wrapped by a ``Nonterminal`` is known as its
  83. "symbol". Symbols are typically strings representing phrasal
  84. categories (such as ``"NP"`` or ``"VP"``). However, more complex
  85. symbol types are sometimes used (e.g., for lexicalized grammars).
  86. Since symbols are node values, they must be immutable and
  87. hashable. Two ``Nonterminals`` are considered equal if their
  88. symbols are equal.
  89. :see: ``CFG``, ``Production``
  90. :type _symbol: any
  91. :ivar _symbol: The node value corresponding to this
  92. ``Nonterminal``. This value must be immutable and hashable.
  93. """
  94. def __init__(self, symbol):
  95. """
  96. Construct a new non-terminal from the given symbol.
  97. :type symbol: any
  98. :param symbol: The node value corresponding to this
  99. ``Nonterminal``. This value must be immutable and
  100. hashable.
  101. """
  102. self._symbol = symbol
  103. self._hash = hash(symbol)
  104. def symbol(self):
  105. """
  106. Return the node value corresponding to this ``Nonterminal``.
  107. :rtype: (any)
  108. """
  109. return self._symbol
  110. def __eq__(self, other):
  111. """
  112. Return True if this non-terminal is equal to ``other``. In
  113. particular, return True if ``other`` is a ``Nonterminal``
  114. and this non-terminal's symbol is equal to ``other`` 's symbol.
  115. :rtype: bool
  116. """
  117. return type(self) == type(other) and self._symbol == other._symbol
  118. def __ne__(self, other):
  119. return not self == other
  120. def __lt__(self, other):
  121. if not isinstance(other, Nonterminal):
  122. raise_unorderable_types("<", self, other)
  123. return self._symbol < other._symbol
  124. def __hash__(self):
  125. return self._hash
  126. def __repr__(self):
  127. """
  128. Return a string representation for this ``Nonterminal``.
  129. :rtype: str
  130. """
  131. if isinstance(self._symbol, string_types):
  132. return '%s' % self._symbol
  133. else:
  134. return '%s' % unicode_repr(self._symbol)
  135. def __str__(self):
  136. """
  137. Return a string representation for this ``Nonterminal``.
  138. :rtype: str
  139. """
  140. if isinstance(self._symbol, string_types):
  141. return '%s' % self._symbol
  142. else:
  143. return '%s' % unicode_repr(self._symbol)
  144. def __div__(self, rhs):
  145. """
  146. Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
  147. the symbol for this nonterminal, and ``B`` is the symbol for rhs.
  148. :param rhs: The nonterminal used to form the right hand side
  149. of the new nonterminal.
  150. :type rhs: Nonterminal
  151. :rtype: Nonterminal
  152. """
  153. return Nonterminal('%s/%s' % (self._symbol, rhs._symbol))
  154. def __truediv__(self, rhs):
  155. """
  156. Return a new nonterminal whose symbol is ``A/B``, where ``A`` is
  157. the symbol for this nonterminal, and ``B`` is the symbol for rhs.
  158. This function allows use of the slash ``/`` operator with
  159. the future import of division.
  160. :param rhs: The nonterminal used to form the right hand side
  161. of the new nonterminal.
  162. :type rhs: Nonterminal
  163. :rtype: Nonterminal
  164. """
  165. return self.__div__(rhs)
  166. def nonterminals(symbols):
  167. """
  168. Given a string containing a list of symbol names, return a list of
  169. ``Nonterminals`` constructed from those symbols.
  170. :param symbols: The symbol name string. This string can be
  171. delimited by either spaces or commas.
  172. :type symbols: str
  173. :return: A list of ``Nonterminals`` constructed from the symbol
  174. names given in ``symbols``. The ``Nonterminals`` are sorted
  175. in the same order as the symbols names.
  176. :rtype: list(Nonterminal)
  177. """
  178. if ',' in symbols:
  179. symbol_list = symbols.split(',')
  180. else:
  181. symbol_list = symbols.split()
  182. return [Nonterminal(s.strip()) for s in symbol_list]
  183. class FeatStructNonterminal(FeatDict, Nonterminal):
  184. """A feature structure that's also a nonterminal. It acts as its
  185. own symbol, and automatically freezes itself when hashed."""
  186. def __hash__(self):
  187. self.freeze()
  188. return FeatStruct.__hash__(self)
  189. def symbol(self):
  190. return self
  191. def is_nonterminal(item):
  192. """
  193. :return: True if the item is a ``Nonterminal``.
  194. :rtype: bool
  195. """
  196. return isinstance(item, Nonterminal)
  197. #################################################################
  198. # Terminals
  199. #################################################################
  200. def is_terminal(item):
  201. """
  202. Return True if the item is a terminal, which currently is
  203. if it is hashable and not a ``Nonterminal``.
  204. :rtype: bool
  205. """
  206. return hasattr(item, '__hash__') and not isinstance(item, Nonterminal)
  207. #################################################################
  208. # Productions
  209. #################################################################
  210. @total_ordering
  211. @python_2_unicode_compatible
  212. class Production(object):
  213. """
  214. A grammar production. Each production maps a single symbol
  215. on the "left-hand side" to a sequence of symbols on the
  216. "right-hand side". (In the case of context-free productions,
  217. the left-hand side must be a ``Nonterminal``, and the right-hand
  218. side is a sequence of terminals and ``Nonterminals``.)
  219. "terminals" can be any immutable hashable object that is
  220. not a ``Nonterminal``. Typically, terminals are strings
  221. representing words, such as ``"dog"`` or ``"under"``.
  222. :see: ``CFG``
  223. :see: ``DependencyGrammar``
  224. :see: ``Nonterminal``
  225. :type _lhs: Nonterminal
  226. :ivar _lhs: The left-hand side of the production.
  227. :type _rhs: tuple(Nonterminal, terminal)
  228. :ivar _rhs: The right-hand side of the production.
  229. """
  230. def __init__(self, lhs, rhs):
  231. """
  232. Construct a new ``Production``.
  233. :param lhs: The left-hand side of the new ``Production``.
  234. :type lhs: Nonterminal
  235. :param rhs: The right-hand side of the new ``Production``.
  236. :type rhs: sequence(Nonterminal and terminal)
  237. """
  238. if isinstance(rhs, string_types):
  239. raise TypeError(
  240. 'production right hand side should be a list, ' 'not a string'
  241. )
  242. self._lhs = lhs
  243. self._rhs = tuple(rhs)
  244. self._hash = hash((self._lhs, self._rhs))
  245. def lhs(self):
  246. """
  247. Return the left-hand side of this ``Production``.
  248. :rtype: Nonterminal
  249. """
  250. return self._lhs
  251. def rhs(self):
  252. """
  253. Return the right-hand side of this ``Production``.
  254. :rtype: sequence(Nonterminal and terminal)
  255. """
  256. return self._rhs
  257. def __len__(self):
  258. """
  259. Return the length of the right-hand side.
  260. :rtype: int
  261. """
  262. return len(self._rhs)
  263. def is_nonlexical(self):
  264. """
  265. Return True if the right-hand side only contains ``Nonterminals``
  266. :rtype: bool
  267. """
  268. return all(is_nonterminal(n) for n in self._rhs)
  269. def is_lexical(self):
  270. """
  271. Return True if the right-hand contain at least one terminal token.
  272. :rtype: bool
  273. """
  274. return not self.is_nonlexical()
  275. def __str__(self):
  276. """
  277. Return a verbose string representation of the ``Production``.
  278. :rtype: str
  279. """
  280. result = '%s -> ' % unicode_repr(self._lhs)
  281. result += " ".join(unicode_repr(el) for el in self._rhs)
  282. return result
  283. def __repr__(self):
  284. """
  285. Return a concise string representation of the ``Production``.
  286. :rtype: str
  287. """
  288. return '%s' % self
  289. def __eq__(self, other):
  290. """
  291. Return True if this ``Production`` is equal to ``other``.
  292. :rtype: bool
  293. """
  294. return (
  295. type(self) == type(other)
  296. and self._lhs == other._lhs
  297. and self._rhs == other._rhs
  298. )
  299. def __ne__(self, other):
  300. return not self == other
  301. def __lt__(self, other):
  302. if not isinstance(other, Production):
  303. raise_unorderable_types("<", self, other)
  304. return (self._lhs, self._rhs) < (other._lhs, other._rhs)
  305. def __hash__(self):
  306. """
  307. Return a hash value for the ``Production``.
  308. :rtype: int
  309. """
  310. return self._hash
  311. @python_2_unicode_compatible
  312. class DependencyProduction(Production):
  313. """
  314. A dependency grammar production. Each production maps a single
  315. head word to an unordered list of one or more modifier words.
  316. """
  317. def __str__(self):
  318. """
  319. Return a verbose string representation of the ``DependencyProduction``.
  320. :rtype: str
  321. """
  322. result = '\'%s\' ->' % (self._lhs,)
  323. for elt in self._rhs:
  324. result += ' \'%s\'' % (elt,)
  325. return result
  326. @python_2_unicode_compatible
  327. class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn):
  328. """
  329. A probabilistic context free grammar production.
  330. A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that
  331. has an associated probability, which represents how likely it is that
  332. this production will be used. In particular, the probability of a
  333. ``ProbabilisticProduction`` records the likelihood that its right-hand side is
  334. the correct instantiation for any given occurrence of its left-hand side.
  335. :see: ``Production``
  336. """
  337. def __init__(self, lhs, rhs, **prob):
  338. """
  339. Construct a new ``ProbabilisticProduction``.
  340. :param lhs: The left-hand side of the new ``ProbabilisticProduction``.
  341. :type lhs: Nonterminal
  342. :param rhs: The right-hand side of the new ``ProbabilisticProduction``.
  343. :type rhs: sequence(Nonterminal and terminal)
  344. :param prob: Probability parameters of the new ``ProbabilisticProduction``.
  345. """
  346. ImmutableProbabilisticMixIn.__init__(self, **prob)
  347. Production.__init__(self, lhs, rhs)
  348. def __str__(self):
  349. return Production.__unicode__(self) + (
  350. ' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob()
  351. )
  352. def __eq__(self, other):
  353. return (
  354. type(self) == type(other)
  355. and self._lhs == other._lhs
  356. and self._rhs == other._rhs
  357. and self.prob() == other.prob()
  358. )
  359. def __ne__(self, other):
  360. return not self == other
  361. def __hash__(self):
  362. return hash((self._lhs, self._rhs, self.prob()))
  363. #################################################################
  364. # Grammars
  365. #################################################################
  366. @python_2_unicode_compatible
  367. class CFG(object):
  368. """
  369. A context-free grammar. A grammar consists of a start state and
  370. a set of productions. The set of terminals and nonterminals is
  371. implicitly specified by the productions.
  372. If you need efficient key-based access to productions, you
  373. can use a subclass to implement it.
  374. """
  375. def __init__(self, start, productions, calculate_leftcorners=True):
  376. """
  377. Create a new context-free grammar, from the given start state
  378. and set of ``Production``s.
  379. :param start: The start symbol
  380. :type start: Nonterminal
  381. :param productions: The list of productions that defines the grammar
  382. :type productions: list(Production)
  383. :param calculate_leftcorners: False if we don't want to calculate the
  384. leftcorner relation. In that case, some optimized chart parsers won't work.
  385. :type calculate_leftcorners: bool
  386. """
  387. if not is_nonterminal(start):
  388. raise TypeError(
  389. "start should be a Nonterminal object,"
  390. " not a %s" % type(start).__name__
  391. )
  392. self._start = start
  393. self._productions = productions
  394. self._categories = set(prod.lhs() for prod in productions)
  395. self._calculate_indexes()
  396. self._calculate_grammar_forms()
  397. if calculate_leftcorners:
  398. self._calculate_leftcorners()
  399. def _calculate_indexes(self):
  400. self._lhs_index = {}
  401. self._rhs_index = {}
  402. self._empty_index = {}
  403. self._lexical_index = {}
  404. for prod in self._productions:
  405. # Left hand side.
  406. lhs = prod._lhs
  407. if lhs not in self._lhs_index:
  408. self._lhs_index[lhs] = []
  409. self._lhs_index[lhs].append(prod)
  410. if prod._rhs:
  411. # First item in right hand side.
  412. rhs0 = prod._rhs[0]
  413. if rhs0 not in self._rhs_index:
  414. self._rhs_index[rhs0] = []
  415. self._rhs_index[rhs0].append(prod)
  416. else:
  417. # The right hand side is empty.
  418. self._empty_index[prod.lhs()] = prod
  419. # Lexical tokens in the right hand side.
  420. for token in prod._rhs:
  421. if is_terminal(token):
  422. self._lexical_index.setdefault(token, set()).add(prod)
  423. def _calculate_leftcorners(self):
  424. # Calculate leftcorner relations, for use in optimized parsing.
  425. self._immediate_leftcorner_categories = dict(
  426. (cat, set([cat])) for cat in self._categories
  427. )
  428. self._immediate_leftcorner_words = dict(
  429. (cat, set()) for cat in self._categories
  430. )
  431. for prod in self.productions():
  432. if len(prod) > 0:
  433. cat, left = prod.lhs(), prod.rhs()[0]
  434. if is_nonterminal(left):
  435. self._immediate_leftcorner_categories[cat].add(left)
  436. else:
  437. self._immediate_leftcorner_words[cat].add(left)
  438. lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True)
  439. self._leftcorners = lc
  440. self._leftcorner_parents = invert_graph(lc)
  441. nr_leftcorner_categories = sum(
  442. map(len, self._immediate_leftcorner_categories.values())
  443. )
  444. nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values()))
  445. if nr_leftcorner_words > nr_leftcorner_categories > 10000:
  446. # If the grammar is big, the leftcorner-word dictionary will be too large.
  447. # In that case it is better to calculate the relation on demand.
  448. self._leftcorner_words = None
  449. return
  450. self._leftcorner_words = {}
  451. for cat in self._leftcorners:
  452. lefts = self._leftcorners[cat]
  453. lc = self._leftcorner_words[cat] = set()
  454. for left in lefts:
  455. lc.update(self._immediate_leftcorner_words.get(left, set()))
  456. @classmethod
  457. def fromstring(cls, input, encoding=None):
  458. """
  459. Return the grammar instance corresponding to the input string(s).
  460. :param input: a grammar, either in the form of a string or as a list of strings.
  461. """
  462. start, productions = read_grammar(
  463. input, standard_nonterm_parser, encoding=encoding
  464. )
  465. return cls(start, productions)
  466. def start(self):
  467. """
  468. Return the start symbol of the grammar
  469. :rtype: Nonterminal
  470. """
  471. return self._start
  472. # tricky to balance readability and efficiency here!
  473. # can't use set operations as they don't preserve ordering
  474. def productions(self, lhs=None, rhs=None, empty=False):
  475. """
  476. Return the grammar productions, filtered by the left-hand side
  477. or the first item in the right-hand side.
  478. :param lhs: Only return productions with the given left-hand side.
  479. :param rhs: Only return productions with the given first item
  480. in the right-hand side.
  481. :param empty: Only return productions with an empty right-hand side.
  482. :return: A list of productions matching the given constraints.
  483. :rtype: list(Production)
  484. """
  485. if rhs and empty:
  486. raise ValueError(
  487. "You cannot select empty and non-empty " "productions at the same time."
  488. )
  489. # no constraints so return everything
  490. if not lhs and not rhs:
  491. if not empty:
  492. return self._productions
  493. else:
  494. return self._empty_index.values()
  495. # only lhs specified so look up its index
  496. elif lhs and not rhs:
  497. if not empty:
  498. return self._lhs_index.get(lhs, [])
  499. elif lhs in self._empty_index:
  500. return [self._empty_index[lhs]]
  501. else:
  502. return []
  503. # only rhs specified so look up its index
  504. elif rhs and not lhs:
  505. return self._rhs_index.get(rhs, [])
  506. # intersect
  507. else:
  508. return [
  509. prod
  510. for prod in self._lhs_index.get(lhs, [])
  511. if prod in self._rhs_index.get(rhs, [])
  512. ]
  513. def leftcorners(self, cat):
  514. """
  515. Return the set of all nonterminals that the given nonterminal
  516. can start with, including itself.
  517. This is the reflexive, transitive closure of the immediate
  518. leftcorner relation: (A > B) iff (A -> B beta)
  519. :param cat: the parent of the leftcorners
  520. :type cat: Nonterminal
  521. :return: the set of all leftcorners
  522. :rtype: set(Nonterminal)
  523. """
  524. return self._leftcorners.get(cat, set([cat]))
  525. def is_leftcorner(self, cat, left):
  526. """
  527. True if left is a leftcorner of cat, where left can be a
  528. terminal or a nonterminal.
  529. :param cat: the parent of the leftcorner
  530. :type cat: Nonterminal
  531. :param left: the suggested leftcorner
  532. :type left: Terminal or Nonterminal
  533. :rtype: bool
  534. """
  535. if is_nonterminal(left):
  536. return left in self.leftcorners(cat)
  537. elif self._leftcorner_words:
  538. return left in self._leftcorner_words.get(cat, set())
  539. else:
  540. return any(
  541. left in self._immediate_leftcorner_words.get(parent, set())
  542. for parent in self.leftcorners(cat)
  543. )
  544. def leftcorner_parents(self, cat):
  545. """
  546. Return the set of all nonterminals for which the given category
  547. is a left corner. This is the inverse of the leftcorner relation.
  548. :param cat: the suggested leftcorner
  549. :type cat: Nonterminal
  550. :return: the set of all parents to the leftcorner
  551. :rtype: set(Nonterminal)
  552. """
  553. return self._leftcorner_parents.get(cat, set([cat]))
  554. def check_coverage(self, tokens):
  555. """
  556. Check whether the grammar rules cover the given list of tokens.
  557. If not, then raise an exception.
  558. :type tokens: list(str)
  559. """
  560. missing = [tok for tok in tokens if not self._lexical_index.get(tok)]
  561. if missing:
  562. missing = ', '.join('%r' % (w,) for w in missing)
  563. raise ValueError(
  564. "Grammar does not cover some of the " "input words: %r." % missing
  565. )
  566. def _calculate_grammar_forms(self):
  567. """
  568. Pre-calculate of which form(s) the grammar is.
  569. """
  570. prods = self._productions
  571. self._is_lexical = all(p.is_lexical() for p in prods)
  572. self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1)
  573. self._min_len = min(len(p) for p in prods)
  574. self._max_len = max(len(p) for p in prods)
  575. self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1)
  576. def is_lexical(self):
  577. """
  578. Return True if all productions are lexicalised.
  579. """
  580. return self._is_lexical
  581. def is_nonlexical(self):
  582. """
  583. Return True if all lexical rules are "preterminals", that is,
  584. unary rules which can be separated in a preprocessing step.
  585. This means that all productions are of the forms
  586. A -> B1 ... Bn (n>=0), or A -> "s".
  587. Note: is_lexical() and is_nonlexical() are not opposites.
  588. There are grammars which are neither, and grammars which are both.
  589. """
  590. return self._is_nonlexical
  591. def min_len(self):
  592. """
  593. Return the right-hand side length of the shortest grammar production.
  594. """
  595. return self._min_len
  596. def max_len(self):
  597. """
  598. Return the right-hand side length of the longest grammar production.
  599. """
  600. return self._max_len
  601. def is_nonempty(self):
  602. """
  603. Return True if there are no empty productions.
  604. """
  605. return self._min_len > 0
  606. def is_binarised(self):
  607. """
  608. Return True if all productions are at most binary.
  609. Note that there can still be empty and unary productions.
  610. """
  611. return self._max_len <= 2
  612. def is_flexible_chomsky_normal_form(self):
  613. """
  614. Return True if all productions are of the forms
  615. A -> B C, A -> B, or A -> "s".
  616. """
  617. return self.is_nonempty() and self.is_nonlexical() and self.is_binarised()
  618. def is_chomsky_normal_form(self):
  619. """
  620. Return True if the grammar is of Chomsky Normal Form, i.e. all productions
  621. are of the form A -> B C, or A -> "s".
  622. """
  623. return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical
  624. def chomsky_normal_form(self, new_token_padding='@$@', flexible=False):
  625. """
  626. Returns a new Grammer that is in chomsky normal
  627. :param: new_token_padding
  628. Customise new rule formation during binarisation
  629. """
  630. if self.is_chomsky_normal_form():
  631. return
  632. if self.productions(empty=True):
  633. raise ValueError(('Grammar has Empty rules. '
  634. 'Cannot deal with them at the moment'))
  635. # check for mixed rules
  636. for rule in self.productions():
  637. if rule.is_lexical() and len(rule.rhs()) > 1:
  638. raise ValueError(
  639. 'Cannot handled mixed rule {} => {}'.format(rule.lhs(),
  640. rule.rhs()))
  641. step1 = CFG.eliminate_start(self)
  642. step2 = CFG.binarize(step1, new_token_padding)
  643. if flexible:
  644. return step2
  645. step3 = CFG.remove_unitary_rules(step2)
  646. return step3
  647. @classmethod
  648. def remove_unitary_rules(cls, grammar):
  649. """
  650. Remove nonlexical unitary rules and convert them to
  651. lexical
  652. """
  653. result = []
  654. unitary = []
  655. for rule in grammar.productions():
  656. if len(rule) == 1 and rule.is_nonlexical():
  657. unitary.append(rule)
  658. else:
  659. result.append(rule)
  660. while unitary:
  661. rule = unitary.pop(0)
  662. for item in grammar.productions(lhs=rule.rhs()[0]):
  663. new_rule = Production(rule.lhs(), item.rhs())
  664. if len(new_rule) != 1 or new_rule.is_lexical():
  665. result.append(new_rule)
  666. else:
  667. unitary.append(new_rule)
  668. n_grammar = CFG(grammar.start(), result)
  669. return n_grammar
  670. @classmethod
  671. def binarize(cls, grammar, padding='@$@'):
  672. """
  673. Convert all non-binary rules into binary by introducing
  674. new tokens.
  675. Example::
  676. Original:
  677. A => B C D
  678. After Conversion:
  679. A => B A@$@B
  680. A@$@B => C D
  681. """
  682. result = []
  683. for rule in grammar.productions():
  684. if len(rule.rhs()) > 2:
  685. # this rule needs to be broken down
  686. left_side = rule.lhs()
  687. for k in range(0, len(rule.rhs()) - 2):
  688. tsym = rule.rhs()[k]
  689. new_sym = Nonterminal(
  690. left_side.symbol() + padding + tsym.symbol()
  691. )
  692. new_production = Production(left_side, (tsym, new_sym))
  693. left_side = new_sym
  694. result.append(new_production)
  695. last_prd = Production(left_side, rule.rhs()[-2:])
  696. result.append(last_prd)
  697. else:
  698. result.append(rule)
  699. n_grammar = CFG(grammar.start(), result)
  700. return n_grammar
  701. @classmethod
  702. def eliminate_start(cls, grammar):
  703. """
  704. Eliminate start rule in case it appears on RHS
  705. Example: S -> S0 S1 and S0 -> S1 S
  706. Then another rule S0_Sigma -> S is added
  707. """
  708. start = grammar.start()
  709. result = []
  710. need_to_add = None
  711. for rule in grammar.productions():
  712. if start in rule.rhs():
  713. need_to_add = True
  714. result.append(rule)
  715. if need_to_add:
  716. start = Nonterminal('S0_SIGMA')
  717. result.append(Production(start, grammar.start()))
  718. n_grammar = CFG(start, result)
  719. return n_grammar
  720. return grammar
  721. def __repr__(self):
  722. return '<Grammar with %d productions>' % len(self._productions)
  723. def __str__(self):
  724. result = 'Grammar with %d productions' % len(self._productions)
  725. result += ' (start state = %r)' % self._start
  726. for production in self._productions:
  727. result += '\n %s' % production
  728. return result
  729. class FeatureGrammar(CFG):
  730. """
  731. A feature-based grammar. This is equivalent to a
  732. ``CFG`` whose nonterminals are all
  733. ``FeatStructNonterminal``.
  734. A grammar consists of a start state and a set of
  735. productions. The set of terminals and nonterminals
  736. is implicitly specified by the productions.
  737. """
  738. def __init__(self, start, productions):
  739. """
  740. Create a new feature-based grammar, from the given start
  741. state and set of ``Productions``.
  742. :param start: The start symbol
  743. :type start: FeatStructNonterminal
  744. :param productions: The list of productions that defines the grammar
  745. :type productions: list(Production)
  746. """
  747. CFG.__init__(self, start, productions)
  748. # The difference with CFG is that the productions are
  749. # indexed on the TYPE feature of the nonterminals.
  750. # This is calculated by the method _get_type_if_possible().
  751. def _calculate_indexes(self):
  752. self._lhs_index = {}
  753. self._rhs_index = {}
  754. self._empty_index = {}
  755. self._empty_productions = []
  756. self._lexical_index = {}
  757. for prod in self._productions:
  758. # Left hand side.
  759. lhs = self._get_type_if_possible(prod._lhs)
  760. if lhs not in self._lhs_index:
  761. self._lhs_index[lhs] = []
  762. self._lhs_index[lhs].append(prod)
  763. if prod._rhs:
  764. # First item in right hand side.
  765. rhs0 = self._get_type_if_possible(prod._rhs[0])
  766. if rhs0 not in self._rhs_index:
  767. self._rhs_index[rhs0] = []
  768. self._rhs_index[rhs0].append(prod)
  769. else:
  770. # The right hand side is empty.
  771. if lhs not in self._empty_index:
  772. self._empty_index[lhs] = []
  773. self._empty_index[lhs].append(prod)
  774. self._empty_productions.append(prod)
  775. # Lexical tokens in the right hand side.
  776. for token in prod._rhs:
  777. if is_terminal(token):
  778. self._lexical_index.setdefault(token, set()).add(prod)
  779. @classmethod
  780. def fromstring(
  781. cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None
  782. ):
  783. """
  784. Return a feature structure based grammar.
  785. :param input: a grammar, either in the form of a string or else
  786. as a list of strings.
  787. :param features: a tuple of features (default: SLASH, TYPE)
  788. :param logic_parser: a parser for lambda-expressions,
  789. by default, ``LogicParser()``
  790. :param fstruct_reader: a feature structure parser
  791. (only if features and logic_parser is None)
  792. """
  793. if features is None:
  794. features = (SLASH, TYPE)
  795. if fstruct_reader is None:
  796. fstruct_reader = FeatStructReader(
  797. features, FeatStructNonterminal, logic_parser=logic_parser
  798. )
  799. elif logic_parser is not None:
  800. raise Exception(
  801. '\'logic_parser\' and \'fstruct_reader\' must ' 'not both be set'
  802. )
  803. start, productions = read_grammar(
  804. input, fstruct_reader.read_partial, encoding=encoding
  805. )
  806. return cls(start, productions)
  807. def productions(self, lhs=None, rhs=None, empty=False):
  808. """
  809. Return the grammar productions, filtered by the left-hand side
  810. or the first item in the right-hand side.
  811. :param lhs: Only return productions with the given left-hand side.
  812. :param rhs: Only return productions with the given first item
  813. in the right-hand side.
  814. :param empty: Only return productions with an empty right-hand side.
  815. :rtype: list(Production)
  816. """
  817. if rhs and empty:
  818. raise ValueError(
  819. "You cannot select empty and non-empty " "productions at the same time."
  820. )
  821. # no constraints so return everything
  822. if not lhs and not rhs:
  823. if empty:
  824. return self._empty_productions
  825. else:
  826. return self._productions
  827. # only lhs specified so look up its index
  828. elif lhs and not rhs:
  829. if empty:
  830. return self._empty_index.get(self._get_type_if_possible(lhs), [])
  831. else:
  832. return self._lhs_index.get(self._get_type_if_possible(lhs), [])
  833. # only rhs specified so look up its index
  834. elif rhs and not lhs:
  835. return self._rhs_index.get(self._get_type_if_possible(rhs), [])
  836. # intersect
  837. else:
  838. return [
  839. prod
  840. for prod in self._lhs_index.get(self._get_type_if_possible(lhs), [])
  841. if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])
  842. ]
  843. def leftcorners(self, cat):
  844. """
  845. Return the set of all words that the given category can start with.
  846. Also called the "first set" in compiler construction.
  847. """
  848. raise NotImplementedError("Not implemented yet")
  849. def leftcorner_parents(self, cat):
  850. """
  851. Return the set of all categories for which the given category
  852. is a left corner.
  853. """
  854. raise NotImplementedError("Not implemented yet")
  855. def _get_type_if_possible(self, item):
  856. """
  857. Helper function which returns the ``TYPE`` feature of the ``item``,
  858. if it exists, otherwise it returns the ``item`` itself
  859. """
  860. if isinstance(item, dict) and TYPE in item:
  861. return FeatureValueType(item[TYPE])
  862. else:
  863. return item
  864. @total_ordering
  865. @python_2_unicode_compatible
  866. class FeatureValueType(object):
  867. """
  868. A helper class for ``FeatureGrammars``, designed to be different
  869. from ordinary strings. This is to stop the ``FeatStruct``
  870. ``FOO[]`` from being compare equal to the terminal "FOO".
  871. """
  872. def __init__(self, value):
  873. self._value = value
  874. self._hash = hash(value)
  875. def __repr__(self):
  876. return '<%s>' % self._value
  877. def __eq__(self, other):
  878. return type(self) == type(other) and self._value == other._value
  879. def __ne__(self, other):
  880. return not self == other
  881. def __lt__(self, other):
  882. if not isinstance(other, FeatureValueType):
  883. raise_unorderable_types("<", self, other)
  884. return self._value < other._value
  885. def __hash__(self):
  886. return self._hash
  887. @python_2_unicode_compatible
  888. class DependencyGrammar(object):
  889. """
  890. A dependency grammar. A DependencyGrammar consists of a set of
  891. productions. Each production specifies a head/modifier relationship
  892. between a pair of words.
  893. """
  894. def __init__(self, productions):
  895. """
  896. Create a new dependency grammar, from the set of ``Productions``.
  897. :param productions: The list of productions that defines the grammar
  898. :type productions: list(Production)
  899. """
  900. self._productions = productions
  901. @classmethod
  902. def fromstring(cls, input):
  903. productions = []
  904. for linenum, line in enumerate(input.split('\n')):
  905. line = line.strip()
  906. if line.startswith('#') or line == '':
  907. continue
  908. try:
  909. productions += _read_dependency_production(line)
  910. except ValueError:
  911. raise ValueError('Unable to parse line %s: %s' % (linenum, line))
  912. if len(productions) == 0:
  913. raise ValueError('No productions found!')
  914. return cls(productions)
  915. def contains(self, head, mod):
  916. """
  917. :param head: A head word.
  918. :type head: str
  919. :param mod: A mod word, to test as a modifier of 'head'.
  920. :type mod: str
  921. :return: true if this ``DependencyGrammar`` contains a
  922. ``DependencyProduction`` mapping 'head' to 'mod'.
  923. :rtype: bool
  924. """
  925. for production in self._productions:
  926. for possibleMod in production._rhs:
  927. if production._lhs == head and possibleMod == mod:
  928. return True
  929. return False
  930. def __contains__(self, head, mod):
  931. """
  932. Return True if this ``DependencyGrammar`` contains a
  933. ``DependencyProduction`` mapping 'head' to 'mod'.
  934. :param head: A head word.
  935. :type head: str
  936. :param mod: A mod word, to test as a modifier of 'head'.
  937. :type mod: str
  938. :rtype: bool
  939. """
  940. for production in self._productions:
  941. for possibleMod in production._rhs:
  942. if production._lhs == head and possibleMod == mod:
  943. return True
  944. return False
  945. # # should be rewritten, the set comp won't work in all comparisons
  946. # def contains_exactly(self, head, modlist):
  947. # for production in self._productions:
  948. # if(len(production._rhs) == len(modlist)):
  949. # if(production._lhs == head):
  950. # set1 = Set(production._rhs)
  951. # set2 = Set(modlist)
  952. # if(set1 == set2):
  953. # return True
  954. # return False
  955. def __str__(self):
  956. """
  957. Return a verbose string representation of the ``DependencyGrammar``
  958. :rtype: str
  959. """
  960. str = 'Dependency grammar with %d productions' % len(self._productions)
  961. for production in self._productions:
  962. str += '\n %s' % production
  963. return str
  964. def __repr__(self):
  965. """
  966. Return a concise string representation of the ``DependencyGrammar``
  967. """
  968. return 'Dependency grammar with %d productions' % len(self._productions)
  969. @python_2_unicode_compatible
  970. class ProbabilisticDependencyGrammar(object):
  971. """
  972. """
  973. def __init__(self, productions, events, tags):
  974. self._productions = productions
  975. self._events = events
  976. self._tags = tags
  977. def contains(self, head, mod):
  978. """
  979. Return True if this ``DependencyGrammar`` contains a
  980. ``DependencyProduction`` mapping 'head' to 'mod'.
  981. :param head: A head word.
  982. :type head: str
  983. :param mod: A mod word, to test as a modifier of 'head'.
  984. :type mod: str
  985. :rtype: bool
  986. """
  987. for production in self._productions:
  988. for possibleMod in production._rhs:
  989. if production._lhs == head and possibleMod == mod:
  990. return True
  991. return False
  992. def __str__(self):
  993. """
  994. Return a verbose string representation of the ``ProbabilisticDependencyGrammar``
  995. :rtype: str
  996. """
  997. str = 'Statistical dependency grammar with %d productions' % len(
  998. self._productions
  999. )
  1000. for production in self._productions:
  1001. str += '\n %s' % production
  1002. str += '\nEvents:'
  1003. for event in self._events:
  1004. str += '\n %d:%s' % (self._events[event], event)
  1005. str += '\nTags:'
  1006. for tag_word in self._tags:
  1007. str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word])
  1008. return str
  1009. def __repr__(self):
  1010. """
  1011. Return a concise string representation of the ``ProbabilisticDependencyGrammar``
  1012. """
  1013. return 'Statistical Dependency grammar with %d productions' % len(
  1014. self._productions
  1015. )
  1016. class PCFG(CFG):
  1017. """
  1018. A probabilistic context-free grammar. A PCFG consists of a
  1019. start state and a set of productions with probabilities. The set of
  1020. terminals and nonterminals is implicitly specified by the productions.
  1021. PCFG productions use the ``ProbabilisticProduction`` class.
  1022. ``PCFGs`` impose the constraint that the set of productions with
  1023. any given left-hand-side must have probabilities that sum to 1
  1024. (allowing for a small margin of error).
  1025. If you need efficient key-based access to productions, you can use
  1026. a subclass to implement it.
  1027. :type EPSILON: float
  1028. :cvar EPSILON: The acceptable margin of error for checking that
  1029. productions with a given left-hand side have probabilities
  1030. that sum to 1.
  1031. """
  1032. EPSILON = 0.01
  1033. def __init__(self, start, productions, calculate_leftcorners=True):
  1034. """
  1035. Create a new context-free grammar, from the given start state
  1036. and set of ``ProbabilisticProductions``.
  1037. :param start: The start symbol
  1038. :type start: Nonterminal
  1039. :param productions: The list of productions that defines the grammar
  1040. :type productions: list(Production)
  1041. :raise ValueError: if the set of productions with any left-hand-side
  1042. do not have probabilities that sum to a value within
  1043. EPSILON of 1.
  1044. :param calculate_leftcorners: False if we don't want to calculate the
  1045. leftcorner relation. In that case, some optimized chart parsers won't work.
  1046. :type calculate_leftcorners: bool
  1047. """
  1048. CFG.__init__(self, start, productions, calculate_leftcorners)
  1049. # Make sure that the probabilities sum to one.
  1050. probs = {}
  1051. for production in productions:
  1052. probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob()
  1053. for (lhs, p) in probs.items():
  1054. if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)):
  1055. raise ValueError("Productions for %r do not sum to 1" % lhs)
  1056. @classmethod
  1057. def fromstring(cls, input, encoding=None):
  1058. """
  1059. Return a probabilistic context-free grammar corresponding to the
  1060. input string(s).
  1061. :param input: a grammar, either in the form of a string or else
  1062. as a list of strings.
  1063. """
  1064. start, productions = read_grammar(
  1065. input, standard_nonterm_parser, probabilistic=True, encoding=encoding
  1066. )
  1067. return cls(start, productions)
  1068. #################################################################
  1069. # Inducing Grammars
  1070. #################################################################
  1071. # Contributed by Nathan Bodenstab <bodenstab@cslu.ogi.edu>
  1072. def induce_pcfg(start, productions):
  1073. """
  1074. Induce a PCFG grammar from a list of productions.
  1075. The probability of a production A -> B C in a PCFG is:
  1076. | count(A -> B C)
  1077. | P(B, C | A) = --------------- where \* is any right hand side
  1078. | count(A -> \*)
  1079. :param start: The start symbol
  1080. :type start: Nonterminal
  1081. :param productions: The list of productions that defines the grammar
  1082. :type productions: list(Production)
  1083. """
  1084. # Production count: the number of times a given production occurs
  1085. pcount = {}
  1086. # LHS-count: counts the number of times a given lhs occurs
  1087. lcount = {}
  1088. for prod in productions:
  1089. lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1
  1090. pcount[prod] = pcount.get(prod, 0) + 1
  1091. prods = [
  1092. ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()])
  1093. for p in pcount
  1094. ]
  1095. return PCFG(start, prods)
  1096. #################################################################
  1097. # Helper functions for reading productions
  1098. #################################################################
  1099. def _read_cfg_production(input):
  1100. """
  1101. Return a list of context-free ``Productions``.
  1102. """
  1103. return _read_production(input, standard_nonterm_parser)
  1104. def _read_pcfg_production(input):
  1105. """
  1106. Return a list of PCFG ``ProbabilisticProductions``.
  1107. """
  1108. return _read_production(input, standard_nonterm_parser, probabilistic=True)
  1109. def _read_fcfg_production(input, fstruct_reader):
  1110. """
  1111. Return a list of feature-based ``Productions``.
  1112. """
  1113. return _read_production(input, fstruct_reader)
  1114. # Parsing generic grammars
  1115. _ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE)
  1116. _PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE)
  1117. _TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE)
  1118. _DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE)
  1119. def _read_production(line, nonterm_parser, probabilistic=False):
  1120. """
  1121. Parse a grammar rule, given as a string, and return
  1122. a list of productions.
  1123. """
  1124. pos = 0
  1125. # Parse the left-hand side.
  1126. lhs, pos = nonterm_parser(line, pos)
  1127. # Skip over the arrow.
  1128. m = _ARROW_RE.match(line, pos)
  1129. if not m:
  1130. raise ValueError('Expected an arrow')
  1131. pos = m.end()
  1132. # Parse the right hand side.
  1133. probabilities = [0.0]
  1134. rhsides = [[]]
  1135. while pos < len(line):
  1136. # Probability.
  1137. m = _PROBABILITY_RE.match(line, pos)
  1138. if probabilistic and m:
  1139. pos = m.end()
  1140. probabilities[-1] = float(m.group(1)[1:-1])
  1141. if probabilities[-1] > 1.0:
  1142. raise ValueError(
  1143. 'Production probability %f, '
  1144. 'should not be greater than 1.0' % (probabilities[-1],)
  1145. )
  1146. # String -- add terminal.
  1147. elif line[pos] in "\'\"":
  1148. m = _TERMINAL_RE.match(line, pos)
  1149. if not m:
  1150. raise ValueError('Unterminated string')
  1151. rhsides[-1].append(m.group(1)[1:-1])
  1152. pos = m.end()
  1153. # Vertical bar -- start new rhside.
  1154. elif line[pos] == '|':
  1155. m = _DISJUNCTION_RE.match(line, pos)
  1156. probabilities.append(0.0)
  1157. rhsides.append([])
  1158. pos = m.end()
  1159. # Anything else -- nonterminal.
  1160. else:
  1161. nonterm, pos = nonterm_parser(line, pos)
  1162. rhsides[-1].append(nonterm)
  1163. if probabilistic:
  1164. return [
  1165. ProbabilisticProduction(lhs, rhs, prob=probability)
  1166. for (rhs, probability) in zip(rhsides, probabilities)
  1167. ]
  1168. else:
  1169. return [Production(lhs, rhs) for rhs in rhsides]
  1170. #################################################################
  1171. # Reading Phrase Structure Grammars
  1172. #################################################################
  1173. def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None):
  1174. """
  1175. Return a pair consisting of a starting category and a list of
  1176. ``Productions``.
  1177. :param input: a grammar, either in the form of a string or else
  1178. as a list of strings.
  1179. :param nonterm_parser: a function for parsing nonterminals.
  1180. It should take a ``(string, position)`` as argument and
  1181. return a ``(nonterminal, position)`` as result.
  1182. :param probabilistic: are the grammar rules probabilistic?
  1183. :type probabilistic: bool
  1184. :param encoding: the encoding of the grammar, if it is a binary string
  1185. :type encoding: str
  1186. """
  1187. if encoding is not None:
  1188. input = input.decode(encoding)
  1189. if isinstance(input, string_types):
  1190. lines = input.split('\n')
  1191. else:
  1192. lines = input
  1193. start = None
  1194. productions = []
  1195. continue_line = ''
  1196. for linenum, line in enumerate(lines):
  1197. line = continue_line + line.strip()
  1198. if line.startswith('#') or line == '':
  1199. continue
  1200. if line.endswith('\\'):
  1201. continue_line = line[:-1].rstrip() + ' '
  1202. continue
  1203. continue_line = ''
  1204. try:
  1205. if line[0] == '%':
  1206. directive, args = line[1:].split(None, 1)
  1207. if directive == 'start':
  1208. start, pos = nonterm_parser(args, 0)
  1209. if pos != len(args):
  1210. raise ValueError('Bad argument to start directive')
  1211. else:
  1212. raise ValueError('Bad directive')
  1213. else:
  1214. # expand out the disjunctions on the RHS
  1215. productions += _read_production(line, nonterm_parser, probabilistic)
  1216. except ValueError as e:
  1217. raise ValueError('Unable to parse line %s: %s\n%s' % (linenum + 1, line, e))
  1218. if not productions:
  1219. raise ValueError('No productions found!')
  1220. if not start:
  1221. start = productions[0].lhs()
  1222. return (start, productions)
  1223. _STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE)
  1224. def standard_nonterm_parser(string, pos):
  1225. m = _STANDARD_NONTERM_RE.match(string, pos)
  1226. if not m:
  1227. raise ValueError('Expected a nonterminal, found: ' + string[pos:])
  1228. return (Nonterminal(m.group(1)), m.end())
  1229. #################################################################
  1230. # Reading Dependency Grammars
  1231. #################################################################
  1232. _READ_DG_RE = re.compile(
  1233. r'''^\s* # leading whitespace
  1234. ('[^']+')\s* # single-quoted lhs
  1235. (?:[-=]+>)\s* # arrow
  1236. (?:( # rhs:
  1237. "[^"]+" # doubled-quoted terminal
  1238. | '[^']+' # single-quoted terminal
  1239. | \| # disjunction
  1240. )
  1241. \s*) # trailing space
  1242. *$''', # zero or more copies
  1243. re.VERBOSE,
  1244. )
  1245. _SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''')
  1246. def _read_dependency_production(s):
  1247. if not _READ_DG_RE.match(s):
  1248. raise ValueError('Bad production string')
  1249. pieces = _SPLIT_DG_RE.split(s)
  1250. pieces = [p for i, p in enumerate(pieces) if i % 2 == 1]
  1251. lhside = pieces[0].strip('\'\"')
  1252. rhsides = [[]]
  1253. for piece in pieces[2:]:
  1254. if piece == '|':
  1255. rhsides.append([])
  1256. else:
  1257. rhsides[-1].append(piece.strip('\'\"'))
  1258. return [DependencyProduction(lhside, rhside) for rhside in rhsides]
  1259. #################################################################
  1260. # Demonstration
  1261. #################################################################
  1262. def cfg_demo():
  1263. """
  1264. A demonstration showing how ``CFGs`` can be created and used.
  1265. """
  1266. from nltk import nonterminals, Production, CFG
  1267. # Create some nonterminals
  1268. S, NP, VP, PP = nonterminals('S, NP, VP, PP')
  1269. N, V, P, Det = nonterminals('N, V, P, Det')
  1270. VP_slash_NP = VP / NP
  1271. print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP / NP])
  1272. print(' S.symbol() =>', repr(S.symbol()))
  1273. print()
  1274. print(Production(S, [NP]))
  1275. # Create some Grammar Productions
  1276. grammar = CFG.fromstring(
  1277. """
  1278. S -> NP VP
  1279. PP -> P NP
  1280. NP -> Det N | NP PP
  1281. VP -> V NP | VP PP
  1282. Det -> 'a' | 'the'
  1283. N -> 'dog' | 'cat'
  1284. V -> 'chased' | 'sat'
  1285. P -> 'on' | 'in'
  1286. """
  1287. )
  1288. print('A Grammar:', repr(grammar))
  1289. print(' grammar.start() =>', repr(grammar.start()))
  1290. print(' grammar.productions() =>', end=' ')
  1291. # Use string.replace(...) is to line-wrap the output.
  1292. print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 25))
  1293. print()
  1294. toy_pcfg1 = PCFG.fromstring(
  1295. """
  1296. S -> NP VP [1.0]
  1297. NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
  1298. Det -> 'the' [0.8] | 'my' [0.2]
  1299. N -> 'man' [0.5] | 'telescope' [0.5]
  1300. VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
  1301. V -> 'ate' [0.35] | 'saw' [0.65]
  1302. PP -> P NP [1.0]
  1303. P -> 'with' [0.61] | 'under' [0.39]
  1304. """
  1305. )
  1306. toy_pcfg2 = PCFG.fromstring(
  1307. """
  1308. S -> NP VP [1.0]
  1309. VP -> V NP [.59]
  1310. VP -> V [.40]
  1311. VP -> VP PP [.01]
  1312. NP -> Det N [.41]
  1313. NP -> Name [.28]
  1314. NP -> NP PP [.31]
  1315. PP -> P NP [1.0]
  1316. V -> 'saw' [.21]
  1317. V -> 'ate' [.51]
  1318. V -> 'ran' [.28]
  1319. N -> 'boy' [.11]
  1320. N -> 'cookie' [.12]
  1321. N -> 'table' [.13]
  1322. N -> 'telescope' [.14]
  1323. N -> 'hill' [.5]
  1324. Name -> 'Jack' [.52]
  1325. Name -> 'Bob' [.48]
  1326. P -> 'with' [.61]
  1327. P -> 'under' [.39]
  1328. Det -> 'the' [.41]
  1329. Det -> 'a' [.31]
  1330. Det -> 'my' [.28]
  1331. """
  1332. )
  1333. def pcfg_demo():
  1334. """
  1335. A demonstration showing how a ``PCFG`` can be created and used.
  1336. """
  1337. from nltk.corpus import treebank
  1338. from nltk import treetransforms
  1339. from nltk import induce_pcfg
  1340. from nltk.parse import pchart
  1341. pcfg_prods = toy_pcfg1.productions()
  1342. pcfg_prod = pcfg_prods[2]
  1343. print('A PCFG production:', repr(pcfg_prod))
  1344. print(' pcfg_prod.lhs() =>', repr(pcfg_prod.lhs()))
  1345. print(' pcfg_prod.rhs() =>', repr(pcfg_prod.rhs()))
  1346. print(' pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
  1347. print()
  1348. grammar = toy_pcfg2
  1349. print('A PCFG grammar:', repr(grammar))
  1350. print(' grammar.start() =>', repr(grammar.start()))
  1351. print(' grammar.productions() =>', end=' ')
  1352. # Use .replace(...) is to line-wrap the output.
  1353. print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
  1354. print()
  1355. # extract productions from three trees and induce the PCFG
  1356. print("Induce PCFG grammar from treebank data:")
  1357. productions = []
  1358. item = treebank._fileids[0]
  1359. for tree in treebank.parsed_sents(item)[:3]:
  1360. # perform optional tree transformations, e.g.:
  1361. tree.collapse_unary(collapsePOS=False)
  1362. tree.chomsky_normal_form(horzMarkov=2)
  1363. productions += tree.productions()
  1364. S = Nonterminal('S')
  1365. grammar = induce_pcfg(S, productions)
  1366. print(grammar)
  1367. print()
  1368. print("Parse sentence using induced grammar:")
  1369. parser = pchart.InsideChartParser(grammar)
  1370. parser.trace(3)
  1371. # doesn't work as tokens are different:
  1372. # sent = treebank.tokenized('wsj_0001.mrg')[0]
  1373. sent = treebank.parsed_sents(item)[0].leaves()
  1374. print(sent)
  1375. for parse in parser.parse(sent):
  1376. print(parse)
  1377. def fcfg_demo():
  1378. import nltk.data
  1379. g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
  1380. print(g)
  1381. print()
  1382. def dg_demo():
  1383. """
  1384. A demonstration showing the creation and inspection of a
  1385. ``DependencyGrammar``.
  1386. """
  1387. grammar = DependencyGrammar.fromstring(
  1388. """
  1389. 'scratch' -> 'cats' | 'walls'
  1390. 'walls' -> 'the'
  1391. 'cats' -> 'the'
  1392. """
  1393. )
  1394. print(grammar)
  1395. def sdg_demo():
  1396. """
  1397. A demonstration of how to read a string representation of
  1398. a CoNLL format dependency tree.
  1399. """
  1400. from nltk.parse import DependencyGraph
  1401. dg = DependencyGraph(
  1402. """
  1403. 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _
  1404. 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _
  1405. 3 met met Prep Prep voor 8 mod _ _
  1406. 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _
  1407. 5 moeder moeder N N soort|ev|neut 3 obj1 _ _
  1408. 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _
  1409. 7 gaan ga V V hulp|inf 6 vc _ _
  1410. 8 winkelen winkel V V intrans|inf 11 cnj _ _
  1411. 9 , , Punc Punc komma 8 punct _ _
  1412. 10 zwemmen zwem V V intrans|inf 11 cnj _ _
  1413. 11 of of Conj Conj neven 7 vc _ _
  1414. 12 terrassen terras N N soort|mv|neut 11 cnj _ _
  1415. 13 . . Punc Punc punt 12 punct _ _
  1416. """
  1417. )
  1418. tree = dg.tree()
  1419. print(tree.pprint())
  1420. def demo():
  1421. cfg_demo()
  1422. pcfg_demo()
  1423. fcfg_demo()
  1424. dg_demo()
  1425. sdg_demo()
  1426. if __name__ == '__main__':
  1427. demo()
  1428. __all__ = [
  1429. 'Nonterminal',
  1430. 'nonterminals',
  1431. 'CFG',
  1432. 'Production',
  1433. 'PCFG',
  1434. 'ProbabilisticProduction',
  1435. 'DependencyGrammar',
  1436. 'DependencyProduction',
  1437. 'ProbabilisticDependencyGrammar',
  1438. 'induce_pcfg',
  1439. 'read_grammar',
  1440. ]