chart.py 61 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: A Chart Parser
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Edward Loper <edloper@gmail.com>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # Jean Mark Gawron <gawron@mail.sdsu.edu>
  8. # Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  9. # URL: <http://nltk.org/>
  10. # For license information, see LICENSE.TXT
  11. """
  12. Data classes and parser implementations for "chart parsers", which
  13. use dynamic programming to efficiently parse a text. A chart
  14. parser derives parse trees for a text by iteratively adding "edges"
  15. to a "chart." Each edge represents a hypothesis about the tree
  16. structure for a subsequence of the text. The chart is a
  17. "blackboard" for composing and combining these hypotheses.
  18. When a chart parser begins parsing a text, it creates a new (empty)
  19. chart, spanning the text. It then incrementally adds new edges to the
  20. chart. A set of "chart rules" specifies the conditions under which
  21. new edges should be added to the chart. Once the chart reaches a
  22. stage where none of the chart rules adds any new edges, parsing is
  23. complete.
  24. Charts are encoded with the ``Chart`` class, and edges are encoded with
  25. the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module
  26. defines three chart parsers:
  27. - ``ChartParser`` is a simple and flexible chart parser. Given a
  28. set of chart rules, it will apply those rules to the chart until
  29. no more edges are added.
  30. - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
  31. be used to step through the parsing process.
  32. """
  33. from __future__ import print_function, division, unicode_literals
  34. import itertools
  35. import re
  36. import warnings
  37. from functools import total_ordering
  38. from six.moves import range
  39. from nltk.tree import Tree
  40. from nltk.grammar import PCFG, is_nonterminal, is_terminal
  41. from nltk.util import OrderedDict
  42. from nltk.internals import raise_unorderable_types
  43. from nltk.compat import python_2_unicode_compatible, unicode_repr
  44. from nltk.parse.api import ParserI
  45. ########################################################################
  46. ## Edges
  47. ########################################################################
  48. @total_ordering
  49. class EdgeI(object):
  50. """
  51. A hypothesis about the structure of part of a sentence.
  52. Each edge records the fact that a structure is (partially)
  53. consistent with the sentence. An edge contains:
  54. - A span, indicating what part of the sentence is
  55. consistent with the hypothesized structure.
  56. - A left-hand side, specifying what kind of structure is
  57. hypothesized.
  58. - A right-hand side, specifying the contents of the
  59. hypothesized structure.
  60. - A dot position, indicating how much of the hypothesized
  61. structure is consistent with the sentence.
  62. Every edge is either complete or incomplete:
  63. - An edge is complete if its structure is fully consistent
  64. with the sentence.
  65. - An edge is incomplete if its structure is partially
  66. consistent with the sentence. For every incomplete edge, the
  67. span specifies a possible prefix for the edge's structure.
  68. There are two kinds of edge:
  69. - A ``TreeEdge`` records which trees have been found to
  70. be (partially) consistent with the text.
  71. - A ``LeafEdge`` records the tokens occurring in the text.
  72. The ``EdgeI`` interface provides a common interface to both types
  73. of edge, allowing chart parsers to treat them in a uniform manner.
  74. """
  75. def __init__(self):
  76. if self.__class__ == EdgeI:
  77. raise TypeError('Edge is an abstract interface')
  78. # ////////////////////////////////////////////////////////////
  79. # Span
  80. # ////////////////////////////////////////////////////////////
  81. def span(self):
  82. """
  83. Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the
  84. portion of the sentence that is consistent with this
  85. edge's structure.
  86. :rtype: tuple(int, int)
  87. """
  88. raise NotImplementedError()
  89. def start(self):
  90. """
  91. Return the start index of this edge's span.
  92. :rtype: int
  93. """
  94. raise NotImplementedError()
  95. def end(self):
  96. """
  97. Return the end index of this edge's span.
  98. :rtype: int
  99. """
  100. raise NotImplementedError()
  101. def length(self):
  102. """
  103. Return the length of this edge's span.
  104. :rtype: int
  105. """
  106. raise NotImplementedError()
  107. # ////////////////////////////////////////////////////////////
  108. # Left Hand Side
  109. # ////////////////////////////////////////////////////////////
  110. def lhs(self):
  111. """
  112. Return this edge's left-hand side, which specifies what kind
  113. of structure is hypothesized by this edge.
  114. :see: ``TreeEdge`` and ``LeafEdge`` for a description of
  115. the left-hand side values for each edge type.
  116. """
  117. raise NotImplementedError()
  118. # ////////////////////////////////////////////////////////////
  119. # Right Hand Side
  120. # ////////////////////////////////////////////////////////////
  121. def rhs(self):
  122. """
  123. Return this edge's right-hand side, which specifies
  124. the content of the structure hypothesized by this edge.
  125. :see: ``TreeEdge`` and ``LeafEdge`` for a description of
  126. the right-hand side values for each edge type.
  127. """
  128. raise NotImplementedError()
  129. def dot(self):
  130. """
  131. Return this edge's dot position, which indicates how much of
  132. the hypothesized structure is consistent with the
  133. sentence. In particular, ``self.rhs[:dot]`` is consistent
  134. with ``tokens[self.start():self.end()]``.
  135. :rtype: int
  136. """
  137. raise NotImplementedError()
  138. def nextsym(self):
  139. """
  140. Return the element of this edge's right-hand side that
  141. immediately follows its dot.
  142. :rtype: Nonterminal or terminal or None
  143. """
  144. raise NotImplementedError()
  145. def is_complete(self):
  146. """
  147. Return True if this edge's structure is fully consistent
  148. with the text.
  149. :rtype: bool
  150. """
  151. raise NotImplementedError()
  152. def is_incomplete(self):
  153. """
  154. Return True if this edge's structure is partially consistent
  155. with the text.
  156. :rtype: bool
  157. """
  158. raise NotImplementedError()
  159. # ////////////////////////////////////////////////////////////
  160. # Comparisons & hashing
  161. # ////////////////////////////////////////////////////////////
  162. def __eq__(self, other):
  163. return (
  164. self.__class__ is other.__class__
  165. and self._comparison_key == other._comparison_key
  166. )
  167. def __ne__(self, other):
  168. return not self == other
  169. def __lt__(self, other):
  170. if not isinstance(other, EdgeI):
  171. raise_unorderable_types("<", self, other)
  172. if self.__class__ is other.__class__:
  173. return self._comparison_key < other._comparison_key
  174. else:
  175. return self.__class__.__name__ < other.__class__.__name__
  176. def __hash__(self):
  177. try:
  178. return self._hash
  179. except AttributeError:
  180. self._hash = hash(self._comparison_key)
  181. return self._hash
  182. @python_2_unicode_compatible
  183. class TreeEdge(EdgeI):
  184. """
  185. An edge that records the fact that a tree is (partially)
  186. consistent with the sentence. A tree edge consists of:
  187. - A span, indicating what part of the sentence is
  188. consistent with the hypothesized tree.
  189. - A left-hand side, specifying the hypothesized tree's node
  190. value.
  191. - A right-hand side, specifying the hypothesized tree's
  192. children. Each element of the right-hand side is either a
  193. terminal, specifying a token with that terminal as its leaf
  194. value; or a nonterminal, specifying a subtree with that
  195. nonterminal's symbol as its node value.
  196. - A dot position, indicating which children are consistent
  197. with part of the sentence. In particular, if ``dot`` is the
  198. dot position, ``rhs`` is the right-hand size, ``(start,end)``
  199. is the span, and ``sentence`` is the list of tokens in the
  200. sentence, then ``tokens[start:end]`` can be spanned by the
  201. children specified by ``rhs[:dot]``.
  202. For more information about edges, see the ``EdgeI`` interface.
  203. """
  204. def __init__(self, span, lhs, rhs, dot=0):
  205. """
  206. Construct a new ``TreeEdge``.
  207. :type span: tuple(int, int)
  208. :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
  209. portion of the sentence that is consistent with the new
  210. edge's structure.
  211. :type lhs: Nonterminal
  212. :param lhs: The new edge's left-hand side, specifying the
  213. hypothesized tree's node value.
  214. :type rhs: list(Nonterminal and str)
  215. :param rhs: The new edge's right-hand side, specifying the
  216. hypothesized tree's children.
  217. :type dot: int
  218. :param dot: The position of the new edge's dot. This position
  219. specifies what prefix of the production's right hand side
  220. is consistent with the text. In particular, if
  221. ``sentence`` is the list of tokens in the sentence, then
  222. ``okens[span[0]:span[1]]`` can be spanned by the
  223. children specified by ``rhs[:dot]``.
  224. """
  225. self._span = span
  226. self._lhs = lhs
  227. rhs = tuple(rhs)
  228. self._rhs = rhs
  229. self._dot = dot
  230. self._comparison_key = (span, lhs, rhs, dot)
  231. @staticmethod
  232. def from_production(production, index):
  233. """
  234. Return a new ``TreeEdge`` formed from the given production.
  235. The new edge's left-hand side and right-hand side will
  236. be taken from ``production``; its span will be
  237. ``(index,index)``; and its dot position will be ``0``.
  238. :rtype: TreeEdge
  239. """
  240. return TreeEdge(
  241. span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
  242. )
  243. def move_dot_forward(self, new_end):
  244. """
  245. Return a new ``TreeEdge`` formed from this edge.
  246. The new edge's dot position is increased by ``1``,
  247. and its end index will be replaced by ``new_end``.
  248. :param new_end: The new end index.
  249. :type new_end: int
  250. :rtype: TreeEdge
  251. """
  252. return TreeEdge(
  253. span=(self._span[0], new_end),
  254. lhs=self._lhs,
  255. rhs=self._rhs,
  256. dot=self._dot + 1,
  257. )
  258. # Accessors
  259. def lhs(self):
  260. return self._lhs
  261. def span(self):
  262. return self._span
  263. def start(self):
  264. return self._span[0]
  265. def end(self):
  266. return self._span[1]
  267. def length(self):
  268. return self._span[1] - self._span[0]
  269. def rhs(self):
  270. return self._rhs
  271. def dot(self):
  272. return self._dot
  273. def is_complete(self):
  274. return self._dot == len(self._rhs)
  275. def is_incomplete(self):
  276. return self._dot != len(self._rhs)
  277. def nextsym(self):
  278. if self._dot >= len(self._rhs):
  279. return None
  280. else:
  281. return self._rhs[self._dot]
  282. # String representation
  283. def __str__(self):
  284. str = '[%s:%s] ' % (self._span[0], self._span[1])
  285. str += '%-2r ->' % (self._lhs,)
  286. for i in range(len(self._rhs)):
  287. if i == self._dot:
  288. str += ' *'
  289. str += ' %s' % unicode_repr(self._rhs[i])
  290. if len(self._rhs) == self._dot:
  291. str += ' *'
  292. return str
  293. def __repr__(self):
  294. return '[Edge: %s]' % self
  295. @python_2_unicode_compatible
  296. class LeafEdge(EdgeI):
  297. """
  298. An edge that records the fact that a leaf value is consistent with
  299. a word in the sentence. A leaf edge consists of:
  300. - An index, indicating the position of the word.
  301. - A leaf, specifying the word's content.
  302. A leaf edge's left-hand side is its leaf value, and its right hand
  303. side is ``()``. Its span is ``[index, index+1]``, and its dot
  304. position is ``0``.
  305. """
  306. def __init__(self, leaf, index):
  307. """
  308. Construct a new ``LeafEdge``.
  309. :param leaf: The new edge's leaf value, specifying the word
  310. that is recorded by this edge.
  311. :param index: The new edge's index, specifying the position of
  312. the word that is recorded by this edge.
  313. """
  314. self._leaf = leaf
  315. self._index = index
  316. self._comparison_key = (leaf, index)
  317. # Accessors
  318. def lhs(self):
  319. return self._leaf
  320. def span(self):
  321. return (self._index, self._index + 1)
  322. def start(self):
  323. return self._index
  324. def end(self):
  325. return self._index + 1
  326. def length(self):
  327. return 1
  328. def rhs(self):
  329. return ()
  330. def dot(self):
  331. return 0
  332. def is_complete(self):
  333. return True
  334. def is_incomplete(self):
  335. return False
  336. def nextsym(self):
  337. return None
  338. # String representations
  339. def __str__(self):
  340. return '[%s:%s] %s' % (self._index, self._index + 1, unicode_repr(self._leaf))
  341. def __repr__(self):
  342. return '[Edge: %s]' % (self)
  343. ########################################################################
  344. ## Chart
  345. ########################################################################
  346. class Chart(object):
  347. """
  348. A blackboard for hypotheses about the syntactic constituents of a
  349. sentence. A chart contains a set of edges, and each edge encodes
  350. a single hypothesis about the structure of some portion of the
  351. sentence.
  352. The ``select`` method can be used to select a specific collection
  353. of edges. For example ``chart.select(is_complete=True, start=0)``
  354. yields all complete edges whose start indices are 0. To ensure
  355. the efficiency of these selection operations, ``Chart`` dynamically
  356. creates and maintains an index for each set of attributes that
  357. have been selected on.
  358. In order to reconstruct the trees that are represented by an edge,
  359. the chart associates each edge with a set of child pointer lists.
  360. A child pointer list is a list of the edges that license an
  361. edge's right-hand side.
  362. :ivar _tokens: The sentence that the chart covers.
  363. :ivar _num_leaves: The number of tokens.
  364. :ivar _edges: A list of the edges in the chart
  365. :ivar _edge_to_cpls: A dictionary mapping each edge to a set
  366. of child pointer lists that are associated with that edge.
  367. :ivar _indexes: A dictionary mapping tuples of edge attributes
  368. to indices, where each index maps the corresponding edge
  369. attribute values to lists of edges.
  370. """
  371. def __init__(self, tokens):
  372. """
  373. Construct a new chart. The chart is initialized with the
  374. leaf edges corresponding to the terminal leaves.
  375. :type tokens: list
  376. :param tokens: The sentence that this chart will be used to parse.
  377. """
  378. # Record the sentence token and the sentence length.
  379. self._tokens = tuple(tokens)
  380. self._num_leaves = len(self._tokens)
  381. # Initialise the chart.
  382. self.initialize()
  383. def initialize(self):
  384. """
  385. Clear the chart.
  386. """
  387. # A list of edges contained in this chart.
  388. self._edges = []
  389. # The set of child pointer lists associated with each edge.
  390. self._edge_to_cpls = {}
  391. # Indexes mapping attribute values to lists of edges
  392. # (used by select()).
  393. self._indexes = {}
  394. # ////////////////////////////////////////////////////////////
  395. # Sentence Access
  396. # ////////////////////////////////////////////////////////////
  397. def num_leaves(self):
  398. """
  399. Return the number of words in this chart's sentence.
  400. :rtype: int
  401. """
  402. return self._num_leaves
  403. def leaf(self, index):
  404. """
  405. Return the leaf value of the word at the given index.
  406. :rtype: str
  407. """
  408. return self._tokens[index]
  409. def leaves(self):
  410. """
  411. Return a list of the leaf values of each word in the
  412. chart's sentence.
  413. :rtype: list(str)
  414. """
  415. return self._tokens
  416. # ////////////////////////////////////////////////////////////
  417. # Edge access
  418. # ////////////////////////////////////////////////////////////
  419. def edges(self):
  420. """
  421. Return a list of all edges in this chart. New edges
  422. that are added to the chart after the call to edges()
  423. will *not* be contained in this list.
  424. :rtype: list(EdgeI)
  425. :see: ``iteredges``, ``select``
  426. """
  427. return self._edges[:]
  428. def iteredges(self):
  429. """
  430. Return an iterator over the edges in this chart. It is
  431. not guaranteed that new edges which are added to the
  432. chart before the iterator is exhausted will also be generated.
  433. :rtype: iter(EdgeI)
  434. :see: ``edges``, ``select``
  435. """
  436. return iter(self._edges)
  437. # Iterating over the chart yields its edges.
  438. __iter__ = iteredges
  439. def num_edges(self):
  440. """
  441. Return the number of edges contained in this chart.
  442. :rtype: int
  443. """
  444. return len(self._edge_to_cpls)
  445. def select(self, **restrictions):
  446. """
  447. Return an iterator over the edges in this chart. Any
  448. new edges that are added to the chart before the iterator
  449. is exahusted will also be generated. ``restrictions``
  450. can be used to restrict the set of edges that will be
  451. generated.
  452. :param span: Only generate edges ``e`` where ``e.span()==span``
  453. :param start: Only generate edges ``e`` where ``e.start()==start``
  454. :param end: Only generate edges ``e`` where ``e.end()==end``
  455. :param length: Only generate edges ``e`` where ``e.length()==length``
  456. :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs``
  457. :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs``
  458. :param nextsym: Only generate edges ``e`` where
  459. ``e.nextsym()==nextsym``
  460. :param dot: Only generate edges ``e`` where ``e.dot()==dot``
  461. :param is_complete: Only generate edges ``e`` where
  462. ``e.is_complete()==is_complete``
  463. :param is_incomplete: Only generate edges ``e`` where
  464. ``e.is_incomplete()==is_incomplete``
  465. :rtype: iter(EdgeI)
  466. """
  467. # If there are no restrictions, then return all edges.
  468. if restrictions == {}:
  469. return iter(self._edges)
  470. # Find the index corresponding to the given restrictions.
  471. restr_keys = sorted(restrictions.keys())
  472. restr_keys = tuple(restr_keys)
  473. # If it doesn't exist, then create it.
  474. if restr_keys not in self._indexes:
  475. self._add_index(restr_keys)
  476. vals = tuple(restrictions[key] for key in restr_keys)
  477. return iter(self._indexes[restr_keys].get(vals, []))
  478. def _add_index(self, restr_keys):
  479. """
  480. A helper function for ``select``, which creates a new index for
  481. a given set of attributes (aka restriction keys).
  482. """
  483. # Make sure it's a valid index.
  484. for key in restr_keys:
  485. if not hasattr(EdgeI, key):
  486. raise ValueError('Bad restriction: %s' % key)
  487. # Create the index.
  488. index = self._indexes[restr_keys] = {}
  489. # Add all existing edges to the index.
  490. for edge in self._edges:
  491. vals = tuple(getattr(edge, key)() for key in restr_keys)
  492. index.setdefault(vals, []).append(edge)
  493. def _register_with_indexes(self, edge):
  494. """
  495. A helper function for ``insert``, which registers the new
  496. edge with all existing indexes.
  497. """
  498. for (restr_keys, index) in self._indexes.items():
  499. vals = tuple(getattr(edge, key)() for key in restr_keys)
  500. index.setdefault(vals, []).append(edge)
  501. # ////////////////////////////////////////////////////////////
  502. # Edge Insertion
  503. # ////////////////////////////////////////////////////////////
  504. def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
  505. """
  506. Add a new edge to the chart, using a pointer to the previous edge.
  507. """
  508. cpls = self.child_pointer_lists(previous_edge)
  509. new_cpls = [cpl + (child_edge,) for cpl in cpls]
  510. return self.insert(new_edge, *new_cpls)
  511. def insert(self, edge, *child_pointer_lists):
  512. """
  513. Add a new edge to the chart, and return True if this operation
  514. modified the chart. In particular, return true iff the chart
  515. did not already contain ``edge``, or if it did not already associate
  516. ``child_pointer_lists`` with ``edge``.
  517. :type edge: EdgeI
  518. :param edge: The new edge
  519. :type child_pointer_lists: sequence of tuple(EdgeI)
  520. :param child_pointer_lists: A sequence of lists of the edges that
  521. were used to form this edge. This list is used to reconstruct
  522. the trees (or partial trees) that are associated with ``edge``.
  523. :rtype: bool
  524. """
  525. # Is it a new edge?
  526. if edge not in self._edge_to_cpls:
  527. # Add it to the list of edges.
  528. self._append_edge(edge)
  529. # Register with indexes.
  530. self._register_with_indexes(edge)
  531. # Get the set of child pointer lists for this edge.
  532. cpls = self._edge_to_cpls.setdefault(edge, OrderedDict())
  533. chart_was_modified = False
  534. for child_pointer_list in child_pointer_lists:
  535. child_pointer_list = tuple(child_pointer_list)
  536. if child_pointer_list not in cpls:
  537. # It's a new CPL; register it, and return true.
  538. cpls[child_pointer_list] = True
  539. chart_was_modified = True
  540. return chart_was_modified
  541. def _append_edge(self, edge):
  542. self._edges.append(edge)
  543. # ////////////////////////////////////////////////////////////
  544. # Tree extraction & child pointer lists
  545. # ////////////////////////////////////////////////////////////
  546. def parses(self, root, tree_class=Tree):
  547. """
  548. Return an iterator of the complete tree structures that span
  549. the entire chart, and whose root node is ``root``.
  550. """
  551. for edge in self.select(start=0, end=self._num_leaves, lhs=root):
  552. for tree in self.trees(edge, tree_class=tree_class, complete=True):
  553. yield tree
  554. def trees(self, edge, tree_class=Tree, complete=False):
  555. """
  556. Return an iterator of the tree structures that are associated
  557. with ``edge``.
  558. If ``edge`` is incomplete, then the unexpanded children will be
  559. encoded as childless subtrees, whose node value is the
  560. corresponding terminal or nonterminal.
  561. :rtype: list(Tree)
  562. :note: If two trees share a common subtree, then the same
  563. Tree may be used to encode that subtree in
  564. both trees. If you need to eliminate this subtree
  565. sharing, then create a deep copy of each tree.
  566. """
  567. return iter(self._trees(edge, complete, memo={}, tree_class=tree_class))
  568. def _trees(self, edge, complete, memo, tree_class):
  569. """
  570. A helper function for ``trees``.
  571. :param memo: A dictionary used to record the trees that we've
  572. generated for each edge, so that when we see an edge more
  573. than once, we can reuse the same trees.
  574. """
  575. # If we've seen this edge before, then reuse our old answer.
  576. if edge in memo:
  577. return memo[edge]
  578. # when we're reading trees off the chart, don't use incomplete edges
  579. if complete and edge.is_incomplete():
  580. return []
  581. # Leaf edges.
  582. if isinstance(edge, LeafEdge):
  583. leaf = self._tokens[edge.start()]
  584. memo[edge] = [leaf]
  585. return [leaf]
  586. # Until we're done computing the trees for edge, set
  587. # memo[edge] to be empty. This has the effect of filtering
  588. # out any cyclic trees (i.e., trees that contain themselves as
  589. # descendants), because if we reach this edge via a cycle,
  590. # then it will appear that the edge doesn't generate any trees.
  591. memo[edge] = []
  592. trees = []
  593. lhs = edge.lhs().symbol()
  594. # Each child pointer list can be used to form trees.
  595. for cpl in self.child_pointer_lists(edge):
  596. # Get the set of child choices for each child pointer.
  597. # child_choices[i] is the set of choices for the tree's
  598. # ith child.
  599. child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
  600. # For each combination of children, add a tree.
  601. for children in itertools.product(*child_choices):
  602. trees.append(tree_class(lhs, children))
  603. # If the edge is incomplete, then extend it with "partial trees":
  604. if edge.is_incomplete():
  605. unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]]
  606. for tree in trees:
  607. tree.extend(unexpanded)
  608. # Update the memoization dictionary.
  609. memo[edge] = trees
  610. # Return the list of trees.
  611. return trees
  612. def child_pointer_lists(self, edge):
  613. """
  614. Return the set of child pointer lists for the given edge.
  615. Each child pointer list is a list of edges that have
  616. been used to form this edge.
  617. :rtype: list(list(EdgeI))
  618. """
  619. # Make a copy, in case they modify it.
  620. return self._edge_to_cpls.get(edge, {}).keys()
  621. # ////////////////////////////////////////////////////////////
  622. # Display
  623. # ////////////////////////////////////////////////////////////
  624. def pretty_format_edge(self, edge, width=None):
  625. """
  626. Return a pretty-printed string representation of a given edge
  627. in this chart.
  628. :rtype: str
  629. :param width: The number of characters allotted to each
  630. index in the sentence.
  631. """
  632. if width is None:
  633. width = 50 // (self.num_leaves() + 1)
  634. (start, end) = (edge.start(), edge.end())
  635. str = '|' + ('.' + ' ' * (width - 1)) * start
  636. # Zero-width edges are "#" if complete, ">" if incomplete
  637. if start == end:
  638. if edge.is_complete():
  639. str += '#'
  640. else:
  641. str += '>'
  642. # Spanning complete edges are "[===]"; Other edges are
  643. # "[---]" if complete, "[--->" if incomplete
  644. elif edge.is_complete() and edge.span() == (0, self._num_leaves):
  645. str += '[' + ('=' * width) * (end - start - 1) + '=' * (width - 1) + ']'
  646. elif edge.is_complete():
  647. str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + ']'
  648. else:
  649. str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + '>'
  650. str += (' ' * (width - 1) + '.') * (self._num_leaves - end)
  651. return str + '| %s' % edge
  652. def pretty_format_leaves(self, width=None):
  653. """
  654. Return a pretty-printed string representation of this
  655. chart's leaves. This string can be used as a header
  656. for calls to ``pretty_format_edge``.
  657. """
  658. if width is None:
  659. width = 50 // (self.num_leaves() + 1)
  660. if self._tokens is not None and width > 1:
  661. header = '|.'
  662. for tok in self._tokens:
  663. header += tok[: width - 1].center(width - 1) + '.'
  664. header += '|'
  665. else:
  666. header = ''
  667. return header
  668. def pretty_format(self, width=None):
  669. """
  670. Return a pretty-printed string representation of this chart.
  671. :param width: The number of characters allotted to each
  672. index in the sentence.
  673. :rtype: str
  674. """
  675. if width is None:
  676. width = 50 // (self.num_leaves() + 1)
  677. # sort edges: primary key=length, secondary key=start index.
  678. # (and filter out the token edges)
  679. edges = sorted([(e.length(), e.start(), e) for e in self])
  680. edges = [e for (_, _, e) in edges]
  681. return (
  682. self.pretty_format_leaves(width)
  683. + '\n'
  684. + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges)
  685. )
  686. # ////////////////////////////////////////////////////////////
  687. # Display: Dot (AT&T Graphviz)
  688. # ////////////////////////////////////////////////////////////
  689. def dot_digraph(self):
  690. # Header
  691. s = 'digraph nltk_chart {\n'
  692. # s += ' size="5,5";\n'
  693. s += ' rankdir=LR;\n'
  694. s += ' node [height=0.1,width=0.1];\n'
  695. s += ' node [style=filled, color="lightgray"];\n'
  696. # Set up the nodes
  697. for y in range(self.num_edges(), -1, -1):
  698. if y == 0:
  699. s += ' node [style=filled, color="black"];\n'
  700. for x in range(self.num_leaves() + 1):
  701. if y == 0 or (
  702. x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
  703. ):
  704. s += ' %04d.%04d [label=""];\n' % (x, y)
  705. # Add a spacer
  706. s += ' x [style=invis]; x->0000.0000 [style=invis];\n'
  707. # Declare ranks.
  708. for x in range(self.num_leaves() + 1):
  709. s += ' {rank=same;'
  710. for y in range(self.num_edges() + 1):
  711. if y == 0 or (
  712. x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
  713. ):
  714. s += ' %04d.%04d' % (x, y)
  715. s += '}\n'
  716. # Add the leaves
  717. s += ' edge [style=invis, weight=100];\n'
  718. s += ' node [shape=plaintext]\n'
  719. s += ' 0000.0000'
  720. for x in range(self.num_leaves()):
  721. s += '->%s->%04d.0000' % (self.leaf(x), x + 1)
  722. s += ';\n\n'
  723. # Add the edges
  724. s += ' edge [style=solid, weight=1];\n'
  725. for y, edge in enumerate(self):
  726. for x in range(edge.start()):
  727. s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
  728. x,
  729. y + 1,
  730. x + 1,
  731. y + 1,
  732. )
  733. s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % (
  734. edge.start(),
  735. y + 1,
  736. edge.end(),
  737. y + 1,
  738. edge,
  739. )
  740. for x in range(edge.end(), self.num_leaves()):
  741. s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % (
  742. x,
  743. y + 1,
  744. x + 1,
  745. y + 1,
  746. )
  747. s += '}\n'
  748. return s
  749. ########################################################################
  750. ## Chart Rules
  751. ########################################################################
  752. class ChartRuleI(object):
  753. """
  754. A rule that specifies what new edges are licensed by any given set
  755. of existing edges. Each chart rule expects a fixed number of
  756. edges, as indicated by the class variable ``NUM_EDGES``. In
  757. particular:
  758. - A chart rule with ``NUM_EDGES=0`` specifies what new edges are
  759. licensed, regardless of existing edges.
  760. - A chart rule with ``NUM_EDGES=1`` specifies what new edges are
  761. licensed by a single existing edge.
  762. - A chart rule with ``NUM_EDGES=2`` specifies what new edges are
  763. licensed by a pair of existing edges.
  764. :type NUM_EDGES: int
  765. :cvar NUM_EDGES: The number of existing edges that this rule uses
  766. to license new edges. Typically, this number ranges from zero
  767. to two.
  768. """
  769. def apply(self, chart, grammar, *edges):
  770. """
  771. Return a generator that will add edges licensed by this rule
  772. and the given edges to the chart, one at a time. Each
  773. time the generator is resumed, it will either add a new
  774. edge and yield that edge; or return.
  775. :type edges: list(EdgeI)
  776. :param edges: A set of existing edges. The number of edges
  777. that should be passed to ``apply()`` is specified by the
  778. ``NUM_EDGES`` class variable.
  779. :rtype: iter(EdgeI)
  780. """
  781. raise NotImplementedError()
  782. def apply_everywhere(self, chart, grammar):
  783. """
  784. Return a generator that will add all edges licensed by
  785. this rule, given the edges that are currently in the
  786. chart, one at a time. Each time the generator is resumed,
  787. it will either add a new edge and yield that edge; or return.
  788. :rtype: iter(EdgeI)
  789. """
  790. raise NotImplementedError()
  791. @python_2_unicode_compatible
  792. class AbstractChartRule(ChartRuleI):
  793. """
  794. An abstract base class for chart rules. ``AbstractChartRule``
  795. provides:
  796. - A default implementation for ``apply``.
  797. - A default implementation for ``apply_everywhere``,
  798. (Currently, this implementation assumes that ``NUM_EDGES``<=3.)
  799. - A default implementation for ``__str__``, which returns a
  800. name based on the rule's class name.
  801. """
  802. # Subclasses must define apply.
  803. def apply(self, chart, grammar, *edges):
  804. raise NotImplementedError()
  805. # Default: loop through the given number of edges, and call
  806. # self.apply() for each set of edges.
  807. def apply_everywhere(self, chart, grammar):
  808. if self.NUM_EDGES == 0:
  809. for new_edge in self.apply(chart, grammar):
  810. yield new_edge
  811. elif self.NUM_EDGES == 1:
  812. for e1 in chart:
  813. for new_edge in self.apply(chart, grammar, e1):
  814. yield new_edge
  815. elif self.NUM_EDGES == 2:
  816. for e1 in chart:
  817. for e2 in chart:
  818. for new_edge in self.apply(chart, grammar, e1, e2):
  819. yield new_edge
  820. elif self.NUM_EDGES == 3:
  821. for e1 in chart:
  822. for e2 in chart:
  823. for e3 in chart:
  824. for new_edge in self.apply(chart, grammar, e1, e2, e3):
  825. yield new_edge
  826. else:
  827. raise AssertionError('NUM_EDGES>3 is not currently supported')
  828. # Default: return a name based on the class name.
  829. def __str__(self):
  830. # Add spaces between InitialCapsWords.
  831. return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
  832. # ////////////////////////////////////////////////////////////
  833. # Fundamental Rule
  834. # ////////////////////////////////////////////////////////////
  835. class FundamentalRule(AbstractChartRule):
  836. """
  837. A rule that joins two adjacent edges to form a single combined
  838. edge. In particular, this rule specifies that any pair of edges
  839. - ``[A -> alpha \* B beta][i:j]``
  840. - ``[B -> gamma \*][j:k]``
  841. licenses the edge:
  842. - ``[A -> alpha B * beta][i:j]``
  843. """
  844. NUM_EDGES = 2
  845. def apply(self, chart, grammar, left_edge, right_edge):
  846. # Make sure the rule is applicable.
  847. if not (
  848. left_edge.is_incomplete()
  849. and right_edge.is_complete()
  850. and left_edge.end() == right_edge.start()
  851. and left_edge.nextsym() == right_edge.lhs()
  852. ):
  853. return
  854. # Construct the new edge.
  855. new_edge = left_edge.move_dot_forward(right_edge.end())
  856. # Insert it into the chart.
  857. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  858. yield new_edge
  859. class SingleEdgeFundamentalRule(FundamentalRule):
  860. """
  861. A rule that joins a given edge with adjacent edges in the chart,
  862. to form combined edges. In particular, this rule specifies that
  863. either of the edges:
  864. - ``[A -> alpha \* B beta][i:j]``
  865. - ``[B -> gamma \*][j:k]``
  866. licenses the edge:
  867. - ``[A -> alpha B * beta][i:j]``
  868. if the other edge is already in the chart.
  869. :note: This is basically ``FundamentalRule``, with one edge left
  870. unspecified.
  871. """
  872. NUM_EDGES = 1
  873. def apply(self, chart, grammar, edge):
  874. if edge.is_incomplete():
  875. for new_edge in self._apply_incomplete(chart, grammar, edge):
  876. yield new_edge
  877. else:
  878. for new_edge in self._apply_complete(chart, grammar, edge):
  879. yield new_edge
  880. def _apply_complete(self, chart, grammar, right_edge):
  881. for left_edge in chart.select(
  882. end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
  883. ):
  884. new_edge = left_edge.move_dot_forward(right_edge.end())
  885. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  886. yield new_edge
  887. def _apply_incomplete(self, chart, grammar, left_edge):
  888. for right_edge in chart.select(
  889. start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
  890. ):
  891. new_edge = left_edge.move_dot_forward(right_edge.end())
  892. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  893. yield new_edge
  894. # ////////////////////////////////////////////////////////////
  895. # Inserting Terminal Leafs
  896. # ////////////////////////////////////////////////////////////
  897. class LeafInitRule(AbstractChartRule):
  898. NUM_EDGES = 0
  899. def apply(self, chart, grammar):
  900. for index in range(chart.num_leaves()):
  901. new_edge = LeafEdge(chart.leaf(index), index)
  902. if chart.insert(new_edge, ()):
  903. yield new_edge
  904. # ////////////////////////////////////////////////////////////
  905. # Top-Down Prediction
  906. # ////////////////////////////////////////////////////////////
  907. class TopDownInitRule(AbstractChartRule):
  908. """
  909. A rule licensing edges corresponding to the grammar productions for
  910. the grammar's start symbol. In particular, this rule specifies that
  911. ``[S -> \* alpha][0:i]`` is licensed for each grammar production
  912. ``S -> alpha``, where ``S`` is the grammar's start symbol.
  913. """
  914. NUM_EDGES = 0
  915. def apply(self, chart, grammar):
  916. for prod in grammar.productions(lhs=grammar.start()):
  917. new_edge = TreeEdge.from_production(prod, 0)
  918. if chart.insert(new_edge, ()):
  919. yield new_edge
  920. class TopDownPredictRule(AbstractChartRule):
  921. """
  922. A rule licensing edges corresponding to the grammar productions
  923. for the nonterminal following an incomplete edge's dot. In
  924. particular, this rule specifies that
  925. ``[A -> alpha \* B beta][i:j]`` licenses the edge
  926. ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``.
  927. :note: This rule corresponds to the Predictor Rule in Earley parsing.
  928. """
  929. NUM_EDGES = 1
  930. def apply(self, chart, grammar, edge):
  931. if edge.is_complete():
  932. return
  933. for prod in grammar.productions(lhs=edge.nextsym()):
  934. new_edge = TreeEdge.from_production(prod, edge.end())
  935. if chart.insert(new_edge, ()):
  936. yield new_edge
  937. class CachedTopDownPredictRule(TopDownPredictRule):
  938. """
  939. A cached version of ``TopDownPredictRule``. After the first time
  940. this rule is applied to an edge with a given ``end`` and ``next``,
  941. it will not generate any more edges for edges with that ``end`` and
  942. ``next``.
  943. If ``chart`` or ``grammar`` are changed, then the cache is flushed.
  944. """
  945. def __init__(self):
  946. TopDownPredictRule.__init__(self)
  947. self._done = {}
  948. def apply(self, chart, grammar, edge):
  949. if edge.is_complete():
  950. return
  951. nextsym, index = edge.nextsym(), edge.end()
  952. if not is_nonterminal(nextsym):
  953. return
  954. # If we've already applied this rule to an edge with the same
  955. # next & end, and the chart & grammar have not changed, then
  956. # just return (no new edges to add).
  957. done = self._done.get((nextsym, index), (None, None))
  958. if done[0] is chart and done[1] is grammar:
  959. return
  960. # Add all the edges indicated by the top down expand rule.
  961. for prod in grammar.productions(lhs=nextsym):
  962. # If the left corner in the predicted production is
  963. # leaf, it must match with the input.
  964. if prod.rhs():
  965. first = prod.rhs()[0]
  966. if is_terminal(first):
  967. if index >= chart.num_leaves() or first != chart.leaf(index):
  968. continue
  969. new_edge = TreeEdge.from_production(prod, index)
  970. if chart.insert(new_edge, ()):
  971. yield new_edge
  972. # Record the fact that we've applied this rule.
  973. self._done[nextsym, index] = (chart, grammar)
  974. # ////////////////////////////////////////////////////////////
  975. # Bottom-Up Prediction
  976. # ////////////////////////////////////////////////////////////
  977. class BottomUpPredictRule(AbstractChartRule):
  978. """
  979. A rule licensing any edge corresponding to a production whose
  980. right-hand side begins with a complete edge's left-hand side. In
  981. particular, this rule specifies that ``[A -> alpha \*]`` licenses
  982. the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
  983. """
  984. NUM_EDGES = 1
  985. def apply(self, chart, grammar, edge):
  986. if edge.is_incomplete():
  987. return
  988. for prod in grammar.productions(rhs=edge.lhs()):
  989. new_edge = TreeEdge.from_production(prod, edge.start())
  990. if chart.insert(new_edge, ()):
  991. yield new_edge
  992. class BottomUpPredictCombineRule(BottomUpPredictRule):
  993. """
  994. A rule licensing any edge corresponding to a production whose
  995. right-hand side begins with a complete edge's left-hand side. In
  996. particular, this rule specifies that ``[A -> alpha \*]``
  997. licenses the edge ``[B -> A \* beta]`` for each grammar
  998. production ``B -> A beta``.
  999. :note: This is like ``BottomUpPredictRule``, but it also applies
  1000. the ``FundamentalRule`` to the resulting edge.
  1001. """
  1002. NUM_EDGES = 1
  1003. def apply(self, chart, grammar, edge):
  1004. if edge.is_incomplete():
  1005. return
  1006. for prod in grammar.productions(rhs=edge.lhs()):
  1007. new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
  1008. if chart.insert(new_edge, (edge,)):
  1009. yield new_edge
  1010. class EmptyPredictRule(AbstractChartRule):
  1011. """
  1012. A rule that inserts all empty productions as passive edges,
  1013. in every position in the chart.
  1014. """
  1015. NUM_EDGES = 0
  1016. def apply(self, chart, grammar):
  1017. for prod in grammar.productions(empty=True):
  1018. for index in range(chart.num_leaves() + 1):
  1019. new_edge = TreeEdge.from_production(prod, index)
  1020. if chart.insert(new_edge, ()):
  1021. yield new_edge
  1022. ########################################################################
  1023. ## Filtered Bottom Up
  1024. ########################################################################
  1025. class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
  1026. def _apply_complete(self, chart, grammar, right_edge):
  1027. end = right_edge.end()
  1028. nexttoken = end < chart.num_leaves() and chart.leaf(end)
  1029. for left_edge in chart.select(
  1030. end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
  1031. ):
  1032. if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
  1033. new_edge = left_edge.move_dot_forward(right_edge.end())
  1034. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  1035. yield new_edge
  1036. def _apply_incomplete(self, chart, grammar, left_edge):
  1037. for right_edge in chart.select(
  1038. start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
  1039. ):
  1040. end = right_edge.end()
  1041. nexttoken = end < chart.num_leaves() and chart.leaf(end)
  1042. if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
  1043. new_edge = left_edge.move_dot_forward(right_edge.end())
  1044. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  1045. yield new_edge
  1046. class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
  1047. def apply(self, chart, grammar, edge):
  1048. if edge.is_incomplete():
  1049. return
  1050. end = edge.end()
  1051. nexttoken = end < chart.num_leaves() and chart.leaf(end)
  1052. for prod in grammar.productions(rhs=edge.lhs()):
  1053. if _bottomup_filter(grammar, nexttoken, prod.rhs()):
  1054. new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
  1055. if chart.insert(new_edge, (edge,)):
  1056. yield new_edge
  1057. def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
  1058. if len(rhs) <= dot + 1:
  1059. return True
  1060. _next = rhs[dot + 1]
  1061. if is_terminal(_next):
  1062. return nexttoken == _next
  1063. else:
  1064. return grammar.is_leftcorner(_next, nexttoken)
  1065. ########################################################################
  1066. ## Generic Chart Parser
  1067. ########################################################################
  1068. TD_STRATEGY = [
  1069. LeafInitRule(),
  1070. TopDownInitRule(),
  1071. CachedTopDownPredictRule(),
  1072. SingleEdgeFundamentalRule(),
  1073. ]
  1074. BU_STRATEGY = [
  1075. LeafInitRule(),
  1076. EmptyPredictRule(),
  1077. BottomUpPredictRule(),
  1078. SingleEdgeFundamentalRule(),
  1079. ]
  1080. BU_LC_STRATEGY = [
  1081. LeafInitRule(),
  1082. EmptyPredictRule(),
  1083. BottomUpPredictCombineRule(),
  1084. SingleEdgeFundamentalRule(),
  1085. ]
  1086. LC_STRATEGY = [
  1087. LeafInitRule(),
  1088. FilteredBottomUpPredictCombineRule(),
  1089. FilteredSingleEdgeFundamentalRule(),
  1090. ]
  1091. class ChartParser(ParserI):
  1092. """
  1093. A generic chart parser. A "strategy", or list of
  1094. ``ChartRuleI`` instances, is used to decide what edges to add to
  1095. the chart. In particular, ``ChartParser`` uses the following
  1096. algorithm to parse texts:
  1097. | Until no new edges are added:
  1098. | For each *rule* in *strategy*:
  1099. | Apply *rule* to any applicable edges in the chart.
  1100. | Return any complete parses in the chart
  1101. """
  1102. def __init__(
  1103. self,
  1104. grammar,
  1105. strategy=BU_LC_STRATEGY,
  1106. trace=0,
  1107. trace_chart_width=50,
  1108. use_agenda=True,
  1109. chart_class=Chart,
  1110. ):
  1111. """
  1112. Create a new chart parser, that uses ``grammar`` to parse
  1113. texts.
  1114. :type grammar: CFG
  1115. :param grammar: The grammar used to parse texts.
  1116. :type strategy: list(ChartRuleI)
  1117. :param strategy: A list of rules that should be used to decide
  1118. what edges to add to the chart (top-down strategy by default).
  1119. :type trace: int
  1120. :param trace: The level of tracing that should be used when
  1121. parsing a text. ``0`` will generate no tracing output;
  1122. and higher numbers will produce more verbose tracing
  1123. output.
  1124. :type trace_chart_width: int
  1125. :param trace_chart_width: The default total width reserved for
  1126. the chart in trace output. The remainder of each line will
  1127. be used to display edges.
  1128. :type use_agenda: bool
  1129. :param use_agenda: Use an optimized agenda-based algorithm,
  1130. if possible.
  1131. :param chart_class: The class that should be used to create
  1132. the parse charts.
  1133. """
  1134. self._grammar = grammar
  1135. self._strategy = strategy
  1136. self._trace = trace
  1137. self._trace_chart_width = trace_chart_width
  1138. # If the strategy only consists of axioms (NUM_EDGES==0) and
  1139. # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm:
  1140. self._use_agenda = use_agenda
  1141. self._chart_class = chart_class
  1142. self._axioms = []
  1143. self._inference_rules = []
  1144. for rule in strategy:
  1145. if rule.NUM_EDGES == 0:
  1146. self._axioms.append(rule)
  1147. elif rule.NUM_EDGES == 1:
  1148. self._inference_rules.append(rule)
  1149. else:
  1150. self._use_agenda = False
  1151. def grammar(self):
  1152. return self._grammar
  1153. def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
  1154. if not trace:
  1155. return
  1156. print_rule_header = trace > 1
  1157. for edge in new_edges:
  1158. if print_rule_header:
  1159. print('%s:' % rule)
  1160. print_rule_header = False
  1161. print(chart.pretty_format_edge(edge, edge_width))
  1162. def chart_parse(self, tokens, trace=None):
  1163. """
  1164. Return the final parse ``Chart`` from which all possible
  1165. parse trees can be extracted.
  1166. :param tokens: The sentence to be parsed
  1167. :type tokens: list(str)
  1168. :rtype: Chart
  1169. """
  1170. if trace is None:
  1171. trace = self._trace
  1172. trace_new_edges = self._trace_new_edges
  1173. tokens = list(tokens)
  1174. self._grammar.check_coverage(tokens)
  1175. chart = self._chart_class(tokens)
  1176. grammar = self._grammar
  1177. # Width, for printing trace edges.
  1178. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
  1179. if trace:
  1180. print(chart.pretty_format_leaves(trace_edge_width))
  1181. if self._use_agenda:
  1182. # Use an agenda-based algorithm.
  1183. for axiom in self._axioms:
  1184. new_edges = list(axiom.apply(chart, grammar))
  1185. trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
  1186. inference_rules = self._inference_rules
  1187. agenda = chart.edges()
  1188. # We reverse the initial agenda, since it is a stack
  1189. # but chart.edges() functions as a queue.
  1190. agenda.reverse()
  1191. while agenda:
  1192. edge = agenda.pop()
  1193. for rule in inference_rules:
  1194. new_edges = list(rule.apply(chart, grammar, edge))
  1195. if trace:
  1196. trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
  1197. agenda += new_edges
  1198. else:
  1199. # Do not use an agenda-based algorithm.
  1200. edges_added = True
  1201. while edges_added:
  1202. edges_added = False
  1203. for rule in self._strategy:
  1204. new_edges = list(rule.apply_everywhere(chart, grammar))
  1205. edges_added = len(new_edges)
  1206. trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
  1207. # Return the final chart.
  1208. return chart
  1209. def parse(self, tokens, tree_class=Tree):
  1210. chart = self.chart_parse(tokens)
  1211. return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
  1212. class TopDownChartParser(ChartParser):
  1213. """
  1214. A ``ChartParser`` using a top-down parsing strategy.
  1215. See ``ChartParser`` for more information.
  1216. """
  1217. def __init__(self, grammar, **parser_args):
  1218. ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
  1219. class BottomUpChartParser(ChartParser):
  1220. """
  1221. A ``ChartParser`` using a bottom-up parsing strategy.
  1222. See ``ChartParser`` for more information.
  1223. """
  1224. def __init__(self, grammar, **parser_args):
  1225. if isinstance(grammar, PCFG):
  1226. warnings.warn(
  1227. "BottomUpChartParser only works for CFG, "
  1228. "use BottomUpProbabilisticChartParser instead",
  1229. category=DeprecationWarning,
  1230. )
  1231. ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
  1232. class BottomUpLeftCornerChartParser(ChartParser):
  1233. """
  1234. A ``ChartParser`` using a bottom-up left-corner parsing strategy.
  1235. This strategy is often more efficient than standard bottom-up.
  1236. See ``ChartParser`` for more information.
  1237. """
  1238. def __init__(self, grammar, **parser_args):
  1239. ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
  1240. class LeftCornerChartParser(ChartParser):
  1241. def __init__(self, grammar, **parser_args):
  1242. if not grammar.is_nonempty():
  1243. raise ValueError(
  1244. "LeftCornerParser only works for grammars " "without empty productions."
  1245. )
  1246. ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
  1247. ########################################################################
  1248. ## Stepping Chart Parser
  1249. ########################################################################
  1250. class SteppingChartParser(ChartParser):
  1251. """
  1252. A ``ChartParser`` that allows you to step through the parsing
  1253. process, adding a single edge at a time. It also allows you to
  1254. change the parser's strategy or grammar midway through parsing a
  1255. text.
  1256. The ``initialize`` method is used to start parsing a text. ``step``
  1257. adds a single edge to the chart. ``set_strategy`` changes the
  1258. strategy used by the chart parser. ``parses`` returns the set of
  1259. parses that has been found by the chart parser.
  1260. :ivar _restart: Records whether the parser's strategy, grammar,
  1261. or chart has been changed. If so, then ``step`` must restart
  1262. the parsing algorithm.
  1263. """
  1264. def __init__(self, grammar, strategy=[], trace=0):
  1265. self._chart = None
  1266. self._current_chartrule = None
  1267. self._restart = False
  1268. ChartParser.__init__(self, grammar, strategy, trace)
  1269. # ////////////////////////////////////////////////////////////
  1270. # Initialization
  1271. # ////////////////////////////////////////////////////////////
  1272. def initialize(self, tokens):
  1273. "Begin parsing the given tokens."
  1274. self._chart = Chart(list(tokens))
  1275. self._restart = True
  1276. # ////////////////////////////////////////////////////////////
  1277. # Stepping
  1278. # ////////////////////////////////////////////////////////////
  1279. def step(self):
  1280. """
  1281. Return a generator that adds edges to the chart, one at a
  1282. time. Each time the generator is resumed, it adds a single
  1283. edge and yields that edge. If no more edges can be added,
  1284. then it yields None.
  1285. If the parser's strategy, grammar, or chart is changed, then
  1286. the generator will continue adding edges using the new
  1287. strategy, grammar, or chart.
  1288. Note that this generator never terminates, since the grammar
  1289. or strategy might be changed to values that would add new
  1290. edges. Instead, it yields None when no more edges can be
  1291. added with the current strategy and grammar.
  1292. """
  1293. if self._chart is None:
  1294. raise ValueError('Parser must be initialized first')
  1295. while True:
  1296. self._restart = False
  1297. w = 50 // (self._chart.num_leaves() + 1)
  1298. for e in self._parse():
  1299. if self._trace > 1:
  1300. print(self._current_chartrule)
  1301. if self._trace > 0:
  1302. print(self._chart.pretty_format_edge(e, w))
  1303. yield e
  1304. if self._restart:
  1305. break
  1306. else:
  1307. yield None # No more edges.
  1308. def _parse(self):
  1309. """
  1310. A generator that implements the actual parsing algorithm.
  1311. ``step`` iterates through this generator, and restarts it
  1312. whenever the parser's strategy, grammar, or chart is modified.
  1313. """
  1314. chart = self._chart
  1315. grammar = self._grammar
  1316. edges_added = 1
  1317. while edges_added > 0:
  1318. edges_added = 0
  1319. for rule in self._strategy:
  1320. self._current_chartrule = rule
  1321. for e in rule.apply_everywhere(chart, grammar):
  1322. edges_added += 1
  1323. yield e
  1324. # ////////////////////////////////////////////////////////////
  1325. # Accessors
  1326. # ////////////////////////////////////////////////////////////
  1327. def strategy(self):
  1328. "Return the strategy used by this parser."
  1329. return self._strategy
  1330. def grammar(self):
  1331. "Return the grammar used by this parser."
  1332. return self._grammar
  1333. def chart(self):
  1334. "Return the chart that is used by this parser."
  1335. return self._chart
  1336. def current_chartrule(self):
  1337. "Return the chart rule used to generate the most recent edge."
  1338. return self._current_chartrule
  1339. def parses(self, tree_class=Tree):
  1340. "Return the parse trees currently contained in the chart."
  1341. return self._chart.parses(self._grammar.start(), tree_class)
  1342. # ////////////////////////////////////////////////////////////
  1343. # Parser modification
  1344. # ////////////////////////////////////////////////////////////
  1345. def set_strategy(self, strategy):
  1346. """
  1347. Change the strategy that the parser uses to decide which edges
  1348. to add to the chart.
  1349. :type strategy: list(ChartRuleI)
  1350. :param strategy: A list of rules that should be used to decide
  1351. what edges to add to the chart.
  1352. """
  1353. if strategy == self._strategy:
  1354. return
  1355. self._strategy = strategy[:] # Make a copy.
  1356. self._restart = True
  1357. def set_grammar(self, grammar):
  1358. "Change the grammar used by the parser."
  1359. if grammar is self._grammar:
  1360. return
  1361. self._grammar = grammar
  1362. self._restart = True
  1363. def set_chart(self, chart):
  1364. "Load a given chart into the chart parser."
  1365. if chart is self._chart:
  1366. return
  1367. self._chart = chart
  1368. self._restart = True
  1369. # ////////////////////////////////////////////////////////////
  1370. # Standard parser methods
  1371. # ////////////////////////////////////////////////////////////
  1372. def parse(self, tokens, tree_class=Tree):
  1373. tokens = list(tokens)
  1374. self._grammar.check_coverage(tokens)
  1375. # Initialize ourselves.
  1376. self.initialize(tokens)
  1377. # Step until no more edges are generated.
  1378. for e in self.step():
  1379. if e is None:
  1380. break
  1381. # Return an iterator of complete parses.
  1382. return self.parses(tree_class=tree_class)
  1383. ########################################################################
  1384. ## Demo Code
  1385. ########################################################################
  1386. def demo_grammar():
  1387. from nltk.grammar import CFG
  1388. return CFG.fromstring(
  1389. """
  1390. S -> NP VP
  1391. PP -> "with" NP
  1392. NP -> NP PP
  1393. VP -> VP PP
  1394. VP -> Verb NP
  1395. VP -> Verb
  1396. NP -> Det Noun
  1397. NP -> "John"
  1398. NP -> "I"
  1399. Det -> "the"
  1400. Det -> "my"
  1401. Det -> "a"
  1402. Noun -> "dog"
  1403. Noun -> "cookie"
  1404. Verb -> "ate"
  1405. Verb -> "saw"
  1406. Prep -> "with"
  1407. Prep -> "under"
  1408. """
  1409. )
  1410. def demo(
  1411. choice=None,
  1412. print_times=True,
  1413. print_grammar=False,
  1414. print_trees=True,
  1415. trace=2,
  1416. sent='I saw John with a dog with my cookie',
  1417. numparses=5,
  1418. ):
  1419. """
  1420. A demonstration of the chart parsers.
  1421. """
  1422. import sys, time
  1423. from nltk import nonterminals, Production, CFG
  1424. # The grammar for ChartParser and SteppingChartParser:
  1425. grammar = demo_grammar()
  1426. if print_grammar:
  1427. print("* Grammar")
  1428. print(grammar)
  1429. # Tokenize the sample sentence.
  1430. print("* Sentence:")
  1431. print(sent)
  1432. tokens = sent.split()
  1433. print(tokens)
  1434. print()
  1435. # Ask the user which parser to test,
  1436. # if the parser wasn't provided as an argument
  1437. if choice is None:
  1438. print(' 1: Top-down chart parser')
  1439. print(' 2: Bottom-up chart parser')
  1440. print(' 3: Bottom-up left-corner chart parser')
  1441. print(' 4: Left-corner chart parser with bottom-up filter')
  1442. print(' 5: Stepping chart parser (alternating top-down & bottom-up)')
  1443. print(' 6: All parsers')
  1444. print('\nWhich parser (1-6)? ', end=' ')
  1445. choice = sys.stdin.readline().strip()
  1446. print()
  1447. choice = str(choice)
  1448. if choice not in "123456":
  1449. print('Bad parser number')
  1450. return
  1451. # Keep track of how long each parser takes.
  1452. times = {}
  1453. strategies = {
  1454. '1': ('Top-down', TD_STRATEGY),
  1455. '2': ('Bottom-up', BU_STRATEGY),
  1456. '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
  1457. '4': ('Filtered left-corner', LC_STRATEGY),
  1458. }
  1459. choices = []
  1460. if choice in strategies:
  1461. choices = [choice]
  1462. if choice == '6':
  1463. choices = "1234"
  1464. # Run the requested chart parser(s), except the stepping parser.
  1465. for strategy in choices:
  1466. print("* Strategy: " + strategies[strategy][0])
  1467. print()
  1468. cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
  1469. t = time.time()
  1470. chart = cp.chart_parse(tokens)
  1471. parses = list(chart.parses(grammar.start()))
  1472. times[strategies[strategy][0]] = time.time() - t
  1473. print("Nr edges in chart:", len(chart.edges()))
  1474. if numparses:
  1475. assert len(parses) == numparses, 'Not all parses found'
  1476. if print_trees:
  1477. for tree in parses:
  1478. print(tree)
  1479. else:
  1480. print("Nr trees:", len(parses))
  1481. print()
  1482. # Run the stepping parser, if requested.
  1483. if choice in "56":
  1484. print("* Strategy: Stepping (top-down vs bottom-up)")
  1485. print()
  1486. t = time.time()
  1487. cp = SteppingChartParser(grammar, trace=trace)
  1488. cp.initialize(tokens)
  1489. for i in range(5):
  1490. print('*** SWITCH TO TOP DOWN')
  1491. cp.set_strategy(TD_STRATEGY)
  1492. for j, e in enumerate(cp.step()):
  1493. if j > 20 or e is None:
  1494. break
  1495. print('*** SWITCH TO BOTTOM UP')
  1496. cp.set_strategy(BU_STRATEGY)
  1497. for j, e in enumerate(cp.step()):
  1498. if j > 20 or e is None:
  1499. break
  1500. times['Stepping'] = time.time() - t
  1501. print("Nr edges in chart:", len(cp.chart().edges()))
  1502. if numparses:
  1503. assert len(list(cp.parses())) == numparses, 'Not all parses found'
  1504. if print_trees:
  1505. for tree in cp.parses():
  1506. print(tree)
  1507. else:
  1508. print("Nr trees:", len(list(cp.parses())))
  1509. print()
  1510. # Print the times of all parsers:
  1511. if not (print_times and times):
  1512. return
  1513. print("* Parsing times")
  1514. print()
  1515. maxlen = max(len(key) for key in times)
  1516. format = '%' + repr(maxlen) + 's parser: %6.3fsec'
  1517. times_items = times.items()
  1518. for (parser, t) in sorted(times_items, key=lambda a: a[1]):
  1519. print(format % (parser, t))
  1520. if __name__ == '__main__':
  1521. demo()