earleychart.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: An Incremental Earley Chart Parser
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
  6. # Rob Speer <rspeer@mit.edu>
  7. # Edward Loper <edloper@gmail.com>
  8. # Steven Bird <stevenbird1@gmail.com>
  9. # Jean Mark Gawron <gawron@mail.sdsu.edu>
  10. # URL: <http://nltk.org/>
  11. # For license information, see LICENSE.TXT
  12. """
  13. Data classes and parser implementations for *incremental* chart
  14. parsers, which use dynamic programming to efficiently parse a text.
  15. A "chart parser" derives parse trees for a text by iteratively adding
  16. \"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree
  17. structure for a subsequence of the text. The "chart" is a
  18. \"blackboard\" for composing and combining these hypotheses.
  19. A parser is "incremental", if it guarantees that for all i, j where i < j,
  20. all edges ending at i are built before any edges ending at j.
  21. This is appealing for, say, speech recognizer hypothesis filtering.
  22. The main parser class is ``EarleyChartParser``, which is a top-down
  23. algorithm, originally formulated by Jay Earley (1970).
  24. """
  25. from __future__ import print_function, division
  26. from six.moves import range
  27. from nltk.parse.chart import (
  28. Chart,
  29. ChartParser,
  30. EdgeI,
  31. LeafEdge,
  32. LeafInitRule,
  33. BottomUpPredictRule,
  34. BottomUpPredictCombineRule,
  35. TopDownInitRule,
  36. SingleEdgeFundamentalRule,
  37. EmptyPredictRule,
  38. CachedTopDownPredictRule,
  39. FilteredSingleEdgeFundamentalRule,
  40. FilteredBottomUpPredictCombineRule,
  41. )
  42. from nltk.parse.featurechart import (
  43. FeatureChart,
  44. FeatureChartParser,
  45. FeatureTopDownInitRule,
  46. FeatureTopDownPredictRule,
  47. FeatureEmptyPredictRule,
  48. FeatureBottomUpPredictRule,
  49. FeatureBottomUpPredictCombineRule,
  50. FeatureSingleEdgeFundamentalRule,
  51. )
  52. # ////////////////////////////////////////////////////////////
  53. # Incremental Chart
  54. # ////////////////////////////////////////////////////////////
  55. class IncrementalChart(Chart):
  56. def initialize(self):
  57. # A sequence of edge lists contained in this chart.
  58. self._edgelists = tuple([] for x in self._positions())
  59. # The set of child pointer lists associated with each edge.
  60. self._edge_to_cpls = {}
  61. # Indexes mapping attribute values to lists of edges
  62. # (used by select()).
  63. self._indexes = {}
  64. def edges(self):
  65. return list(self.iteredges())
  66. def iteredges(self):
  67. return (edge for edgelist in self._edgelists for edge in edgelist)
  68. def select(self, end, **restrictions):
  69. edgelist = self._edgelists[end]
  70. # If there are no restrictions, then return all edges.
  71. if restrictions == {}:
  72. return iter(edgelist)
  73. # Find the index corresponding to the given restrictions.
  74. restr_keys = sorted(restrictions.keys())
  75. restr_keys = tuple(restr_keys)
  76. # If it doesn't exist, then create it.
  77. if restr_keys not in self._indexes:
  78. self._add_index(restr_keys)
  79. vals = tuple(restrictions[key] for key in restr_keys)
  80. return iter(self._indexes[restr_keys][end].get(vals, []))
  81. def _add_index(self, restr_keys):
  82. # Make sure it's a valid index.
  83. for key in restr_keys:
  84. if not hasattr(EdgeI, key):
  85. raise ValueError('Bad restriction: %s' % key)
  86. # Create the index.
  87. index = self._indexes[restr_keys] = tuple({} for x in self._positions())
  88. # Add all existing edges to the index.
  89. for end, edgelist in enumerate(self._edgelists):
  90. this_index = index[end]
  91. for edge in edgelist:
  92. vals = tuple(getattr(edge, key)() for key in restr_keys)
  93. this_index.setdefault(vals, []).append(edge)
  94. def _register_with_indexes(self, edge):
  95. end = edge.end()
  96. for (restr_keys, index) in self._indexes.items():
  97. vals = tuple(getattr(edge, key)() for key in restr_keys)
  98. index[end].setdefault(vals, []).append(edge)
  99. def _append_edge(self, edge):
  100. self._edgelists[edge.end()].append(edge)
  101. def _positions(self):
  102. return range(self.num_leaves() + 1)
  103. class FeatureIncrementalChart(IncrementalChart, FeatureChart):
  104. def select(self, end, **restrictions):
  105. edgelist = self._edgelists[end]
  106. # If there are no restrictions, then return all edges.
  107. if restrictions == {}:
  108. return iter(edgelist)
  109. # Find the index corresponding to the given restrictions.
  110. restr_keys = sorted(restrictions.keys())
  111. restr_keys = tuple(restr_keys)
  112. # If it doesn't exist, then create it.
  113. if restr_keys not in self._indexes:
  114. self._add_index(restr_keys)
  115. vals = tuple(
  116. self._get_type_if_possible(restrictions[key]) for key in restr_keys
  117. )
  118. return iter(self._indexes[restr_keys][end].get(vals, []))
  119. def _add_index(self, restr_keys):
  120. # Make sure it's a valid index.
  121. for key in restr_keys:
  122. if not hasattr(EdgeI, key):
  123. raise ValueError('Bad restriction: %s' % key)
  124. # Create the index.
  125. index = self._indexes[restr_keys] = tuple({} for x in self._positions())
  126. # Add all existing edges to the index.
  127. for end, edgelist in enumerate(self._edgelists):
  128. this_index = index[end]
  129. for edge in edgelist:
  130. vals = tuple(
  131. self._get_type_if_possible(getattr(edge, key)())
  132. for key in restr_keys
  133. )
  134. this_index.setdefault(vals, []).append(edge)
  135. def _register_with_indexes(self, edge):
  136. end = edge.end()
  137. for (restr_keys, index) in self._indexes.items():
  138. vals = tuple(
  139. self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys
  140. )
  141. index[end].setdefault(vals, []).append(edge)
  142. # ////////////////////////////////////////////////////////////
  143. # Incremental CFG Rules
  144. # ////////////////////////////////////////////////////////////
  145. class CompleteFundamentalRule(SingleEdgeFundamentalRule):
  146. def _apply_incomplete(self, chart, grammar, left_edge):
  147. end = left_edge.end()
  148. # When the chart is incremental, we only have to look for
  149. # empty complete edges here.
  150. for right_edge in chart.select(
  151. start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
  152. ):
  153. new_edge = left_edge.move_dot_forward(right_edge.end())
  154. if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
  155. yield new_edge
  156. class CompleterRule(CompleteFundamentalRule):
  157. _fundamental_rule = CompleteFundamentalRule()
  158. def apply(self, chart, grammar, edge):
  159. if not isinstance(edge, LeafEdge):
  160. for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
  161. yield new_edge
  162. class ScannerRule(CompleteFundamentalRule):
  163. _fundamental_rule = CompleteFundamentalRule()
  164. def apply(self, chart, grammar, edge):
  165. if isinstance(edge, LeafEdge):
  166. for new_edge in self._fundamental_rule.apply(chart, grammar, edge):
  167. yield new_edge
  168. class PredictorRule(CachedTopDownPredictRule):
  169. pass
  170. class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule):
  171. def apply(self, chart, grammar, edge):
  172. # Since the Filtered rule only works for grammars without empty productions,
  173. # we only have to bother with complete edges here.
  174. if edge.is_complete():
  175. for new_edge in self._apply_complete(chart, grammar, edge):
  176. yield new_edge
  177. # ////////////////////////////////////////////////////////////
  178. # Incremental FCFG Rules
  179. # ////////////////////////////////////////////////////////////
  180. class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule):
  181. def _apply_incomplete(self, chart, grammar, left_edge):
  182. fr = self._fundamental_rule
  183. end = left_edge.end()
  184. # When the chart is incremental, we only have to look for
  185. # empty complete edges here.
  186. for right_edge in chart.select(
  187. start=end, end=end, is_complete=True, lhs=left_edge.nextsym()
  188. ):
  189. for new_edge in fr.apply(chart, grammar, left_edge, right_edge):
  190. yield new_edge
  191. class FeatureCompleterRule(CompleterRule):
  192. _fundamental_rule = FeatureCompleteFundamentalRule()
  193. class FeatureScannerRule(ScannerRule):
  194. _fundamental_rule = FeatureCompleteFundamentalRule()
  195. class FeaturePredictorRule(FeatureTopDownPredictRule):
  196. pass
  197. # ////////////////////////////////////////////////////////////
  198. # Incremental CFG Chart Parsers
  199. # ////////////////////////////////////////////////////////////
  200. EARLEY_STRATEGY = [
  201. LeafInitRule(),
  202. TopDownInitRule(),
  203. CompleterRule(),
  204. ScannerRule(),
  205. PredictorRule(),
  206. ]
  207. TD_INCREMENTAL_STRATEGY = [
  208. LeafInitRule(),
  209. TopDownInitRule(),
  210. CachedTopDownPredictRule(),
  211. CompleteFundamentalRule(),
  212. ]
  213. BU_INCREMENTAL_STRATEGY = [
  214. LeafInitRule(),
  215. EmptyPredictRule(),
  216. BottomUpPredictRule(),
  217. CompleteFundamentalRule(),
  218. ]
  219. BU_LC_INCREMENTAL_STRATEGY = [
  220. LeafInitRule(),
  221. EmptyPredictRule(),
  222. BottomUpPredictCombineRule(),
  223. CompleteFundamentalRule(),
  224. ]
  225. LC_INCREMENTAL_STRATEGY = [
  226. LeafInitRule(),
  227. FilteredBottomUpPredictCombineRule(),
  228. FilteredCompleteFundamentalRule(),
  229. ]
  230. class IncrementalChartParser(ChartParser):
  231. """
  232. An *incremental* chart parser implementing Jay Earley's
  233. parsing algorithm:
  234. | For each index end in [0, 1, ..., N]:
  235. | For each edge such that edge.end = end:
  236. | If edge is incomplete and edge.next is not a part of speech:
  237. | Apply PredictorRule to edge
  238. | If edge is incomplete and edge.next is a part of speech:
  239. | Apply ScannerRule to edge
  240. | If edge is complete:
  241. | Apply CompleterRule to edge
  242. | Return any complete parses in the chart
  243. """
  244. def __init__(
  245. self,
  246. grammar,
  247. strategy=BU_LC_INCREMENTAL_STRATEGY,
  248. trace=0,
  249. trace_chart_width=50,
  250. chart_class=IncrementalChart,
  251. ):
  252. """
  253. Create a new Earley chart parser, that uses ``grammar`` to
  254. parse texts.
  255. :type grammar: CFG
  256. :param grammar: The grammar used to parse texts.
  257. :type trace: int
  258. :param trace: The level of tracing that should be used when
  259. parsing a text. ``0`` will generate no tracing output;
  260. and higher numbers will produce more verbose tracing
  261. output.
  262. :type trace_chart_width: int
  263. :param trace_chart_width: The default total width reserved for
  264. the chart in trace output. The remainder of each line will
  265. be used to display edges.
  266. :param chart_class: The class that should be used to create
  267. the charts used by this parser.
  268. """
  269. self._grammar = grammar
  270. self._trace = trace
  271. self._trace_chart_width = trace_chart_width
  272. self._chart_class = chart_class
  273. self._axioms = []
  274. self._inference_rules = []
  275. for rule in strategy:
  276. if rule.NUM_EDGES == 0:
  277. self._axioms.append(rule)
  278. elif rule.NUM_EDGES == 1:
  279. self._inference_rules.append(rule)
  280. else:
  281. raise ValueError(
  282. "Incremental inference rules must have " "NUM_EDGES == 0 or 1"
  283. )
  284. def chart_parse(self, tokens, trace=None):
  285. if trace is None:
  286. trace = self._trace
  287. trace_new_edges = self._trace_new_edges
  288. tokens = list(tokens)
  289. self._grammar.check_coverage(tokens)
  290. chart = self._chart_class(tokens)
  291. grammar = self._grammar
  292. # Width, for printing trace edges.
  293. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
  294. if trace:
  295. print(chart.pretty_format_leaves(trace_edge_width))
  296. for axiom in self._axioms:
  297. new_edges = list(axiom.apply(chart, grammar))
  298. trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
  299. inference_rules = self._inference_rules
  300. for end in range(chart.num_leaves() + 1):
  301. if trace > 1:
  302. print("\n* Processing queue:", end, "\n")
  303. agenda = list(chart.select(end=end))
  304. while agenda:
  305. edge = agenda.pop()
  306. for rule in inference_rules:
  307. new_edges = list(rule.apply(chart, grammar, edge))
  308. trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
  309. for new_edge in new_edges:
  310. if new_edge.end() == end:
  311. agenda.append(new_edge)
  312. return chart
  313. class EarleyChartParser(IncrementalChartParser):
  314. def __init__(self, grammar, **parser_args):
  315. IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args)
  316. class IncrementalTopDownChartParser(IncrementalChartParser):
  317. def __init__(self, grammar, **parser_args):
  318. IncrementalChartParser.__init__(
  319. self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args
  320. )
  321. class IncrementalBottomUpChartParser(IncrementalChartParser):
  322. def __init__(self, grammar, **parser_args):
  323. IncrementalChartParser.__init__(
  324. self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args
  325. )
  326. class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser):
  327. def __init__(self, grammar, **parser_args):
  328. IncrementalChartParser.__init__(
  329. self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args
  330. )
  331. class IncrementalLeftCornerChartParser(IncrementalChartParser):
  332. def __init__(self, grammar, **parser_args):
  333. if not grammar.is_nonempty():
  334. raise ValueError(
  335. "IncrementalLeftCornerParser only works for grammars "
  336. "without empty productions."
  337. )
  338. IncrementalChartParser.__init__(
  339. self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args
  340. )
  341. # ////////////////////////////////////////////////////////////
  342. # Incremental FCFG Chart Parsers
  343. # ////////////////////////////////////////////////////////////
  344. EARLEY_FEATURE_STRATEGY = [
  345. LeafInitRule(),
  346. FeatureTopDownInitRule(),
  347. FeatureCompleterRule(),
  348. FeatureScannerRule(),
  349. FeaturePredictorRule(),
  350. ]
  351. TD_INCREMENTAL_FEATURE_STRATEGY = [
  352. LeafInitRule(),
  353. FeatureTopDownInitRule(),
  354. FeatureTopDownPredictRule(),
  355. FeatureCompleteFundamentalRule(),
  356. ]
  357. BU_INCREMENTAL_FEATURE_STRATEGY = [
  358. LeafInitRule(),
  359. FeatureEmptyPredictRule(),
  360. FeatureBottomUpPredictRule(),
  361. FeatureCompleteFundamentalRule(),
  362. ]
  363. BU_LC_INCREMENTAL_FEATURE_STRATEGY = [
  364. LeafInitRule(),
  365. FeatureEmptyPredictRule(),
  366. FeatureBottomUpPredictCombineRule(),
  367. FeatureCompleteFundamentalRule(),
  368. ]
  369. class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser):
  370. def __init__(
  371. self,
  372. grammar,
  373. strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY,
  374. trace_chart_width=20,
  375. chart_class=FeatureIncrementalChart,
  376. **parser_args
  377. ):
  378. IncrementalChartParser.__init__(
  379. self,
  380. grammar,
  381. strategy=strategy,
  382. trace_chart_width=trace_chart_width,
  383. chart_class=chart_class,
  384. **parser_args
  385. )
  386. class FeatureEarleyChartParser(FeatureIncrementalChartParser):
  387. def __init__(self, grammar, **parser_args):
  388. FeatureIncrementalChartParser.__init__(
  389. self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args
  390. )
  391. class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser):
  392. def __init__(self, grammar, **parser_args):
  393. FeatureIncrementalChartParser.__init__(
  394. self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args
  395. )
  396. class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser):
  397. def __init__(self, grammar, **parser_args):
  398. FeatureIncrementalChartParser.__init__(
  399. self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args
  400. )
  401. class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser):
  402. def __init__(self, grammar, **parser_args):
  403. FeatureIncrementalChartParser.__init__(
  404. self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args
  405. )
  406. # ////////////////////////////////////////////////////////////
  407. # Demonstration
  408. # ////////////////////////////////////////////////////////////
  409. def demo(
  410. print_times=True,
  411. print_grammar=False,
  412. print_trees=True,
  413. trace=2,
  414. sent='I saw John with a dog with my cookie',
  415. numparses=5,
  416. ):
  417. """
  418. A demonstration of the Earley parsers.
  419. """
  420. import sys, time
  421. from nltk.parse.chart import demo_grammar
  422. # The grammar for ChartParser and SteppingChartParser:
  423. grammar = demo_grammar()
  424. if print_grammar:
  425. print("* Grammar")
  426. print(grammar)
  427. # Tokenize the sample sentence.
  428. print("* Sentence:")
  429. print(sent)
  430. tokens = sent.split()
  431. print(tokens)
  432. print()
  433. # Do the parsing.
  434. earley = EarleyChartParser(grammar, trace=trace)
  435. t = time.clock()
  436. chart = earley.chart_parse(tokens)
  437. parses = list(chart.parses(grammar.start()))
  438. t = time.clock() - t
  439. # Print results.
  440. if numparses:
  441. assert len(parses) == numparses, 'Not all parses found'
  442. if print_trees:
  443. for tree in parses:
  444. print(tree)
  445. else:
  446. print("Nr trees:", len(parses))
  447. if print_times:
  448. print("Time:", t)
  449. if __name__ == '__main__':
  450. demo()