chart.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. # Natural Language Toolkit: Combinatory Categorial Grammar
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Graeme Gange <ggange@csse.unimelb.edu.au>
  5. # URL: <http://nltk.org/>
  6. # For license information, see LICENSE.TXT
  7. """
  8. The lexicon is constructed by calling
  9. ``lexicon.fromstring(<lexicon string>)``.
  10. In order to construct a parser, you also need a rule set.
  11. The standard English rules are provided in chart as
  12. ``chart.DefaultRuleSet``.
  13. The parser can then be constructed by calling, for example:
  14. ``parser = chart.CCGChartParser(<lexicon>, <ruleset>)``
  15. Parsing is then performed by running
  16. ``parser.parse(<sentence>.split())``.
  17. While this returns a list of trees, the default representation
  18. of the produced trees is not very enlightening, particularly
  19. given that it uses the same tree class as the CFG parsers.
  20. It is probably better to call:
  21. ``chart.printCCGDerivation(<parse tree extracted from list>)``
  22. which should print a nice representation of the derivation.
  23. This entire process is shown far more clearly in the demonstration:
  24. python chart.py
  25. """
  26. from __future__ import print_function, division, unicode_literals
  27. import itertools
  28. from six import string_types
  29. from nltk.parse import ParserI
  30. from nltk.parse.chart import AbstractChartRule, EdgeI, Chart
  31. from nltk.tree import Tree
  32. from nltk.ccg.lexicon import fromstring, Token
  33. from nltk.ccg.combinator import (
  34. ForwardT,
  35. BackwardT,
  36. ForwardApplication,
  37. BackwardApplication,
  38. ForwardComposition,
  39. BackwardComposition,
  40. ForwardSubstitution,
  41. BackwardBx,
  42. BackwardSx,
  43. )
  44. from nltk.compat import python_2_unicode_compatible
  45. from nltk.ccg.combinator import *
  46. from nltk.ccg.logic import *
  47. from nltk.sem.logic import *
  48. # Based on the EdgeI class from NLTK.
  49. # A number of the properties of the EdgeI interface don't
  50. # transfer well to CCGs, however.
  51. class CCGEdge(EdgeI):
  52. def __init__(self, span, categ, rule):
  53. self._span = span
  54. self._categ = categ
  55. self._rule = rule
  56. self._comparison_key = (span, categ, rule)
  57. # Accessors
  58. def lhs(self):
  59. return self._categ
  60. def span(self):
  61. return self._span
  62. def start(self):
  63. return self._span[0]
  64. def end(self):
  65. return self._span[1]
  66. def length(self):
  67. return self._span[1] - self.span[0]
  68. def rhs(self):
  69. return ()
  70. def dot(self):
  71. return 0
  72. def is_complete(self):
  73. return True
  74. def is_incomplete(self):
  75. return False
  76. def nextsym(self):
  77. return None
  78. def categ(self):
  79. return self._categ
  80. def rule(self):
  81. return self._rule
  82. class CCGLeafEdge(EdgeI):
  83. '''
  84. Class representing leaf edges in a CCG derivation.
  85. '''
  86. def __init__(self, pos, token, leaf):
  87. self._pos = pos
  88. self._token = token
  89. self._leaf = leaf
  90. self._comparison_key = (pos, token.categ(), leaf)
  91. # Accessors
  92. def lhs(self):
  93. return self._token.categ()
  94. def span(self):
  95. return (self._pos, self._pos + 1)
  96. def start(self):
  97. return self._pos
  98. def end(self):
  99. return self._pos + 1
  100. def length(self):
  101. return 1
  102. def rhs(self):
  103. return self._leaf
  104. def dot(self):
  105. return 0
  106. def is_complete(self):
  107. return True
  108. def is_incomplete(self):
  109. return False
  110. def nextsym(self):
  111. return None
  112. def token(self):
  113. return self._token
  114. def categ(self):
  115. return self._token.categ()
  116. def leaf(self):
  117. return self._leaf
  118. @python_2_unicode_compatible
  119. class BinaryCombinatorRule(AbstractChartRule):
  120. '''
  121. Class implementing application of a binary combinator to a chart.
  122. Takes the directed combinator to apply.
  123. '''
  124. NUMEDGES = 2
  125. def __init__(self, combinator):
  126. self._combinator = combinator
  127. # Apply a combinator
  128. def apply(self, chart, grammar, left_edge, right_edge):
  129. # The left & right edges must be touching.
  130. if not (left_edge.end() == right_edge.start()):
  131. return
  132. # Check if the two edges are permitted to combine.
  133. # If so, generate the corresponding edge.
  134. if self._combinator.can_combine(left_edge.categ(), right_edge.categ()):
  135. for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
  136. new_edge = CCGEdge(
  137. span=(left_edge.start(), right_edge.end()),
  138. categ=res,
  139. rule=self._combinator,
  140. )
  141. if chart.insert(new_edge, (left_edge, right_edge)):
  142. yield new_edge
  143. # The representation of the combinator (for printing derivations)
  144. def __str__(self):
  145. return "%s" % self._combinator
  146. # Type-raising must be handled slightly differently to the other rules, as the
  147. # resulting rules only span a single edge, rather than both edges.
  148. @python_2_unicode_compatible
  149. class ForwardTypeRaiseRule(AbstractChartRule):
  150. '''
  151. Class for applying forward type raising
  152. '''
  153. NUMEDGES = 2
  154. def __init__(self):
  155. self._combinator = ForwardT
  156. def apply(self, chart, grammar, left_edge, right_edge):
  157. if not (left_edge.end() == right_edge.start()):
  158. return
  159. for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
  160. new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator)
  161. if chart.insert(new_edge, (left_edge,)):
  162. yield new_edge
  163. def __str__(self):
  164. return "%s" % self._combinator
  165. @python_2_unicode_compatible
  166. class BackwardTypeRaiseRule(AbstractChartRule):
  167. '''
  168. Class for applying backward type raising.
  169. '''
  170. NUMEDGES = 2
  171. def __init__(self):
  172. self._combinator = BackwardT
  173. def apply(self, chart, grammar, left_edge, right_edge):
  174. if not (left_edge.end() == right_edge.start()):
  175. return
  176. for res in self._combinator.combine(left_edge.categ(), right_edge.categ()):
  177. new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator)
  178. if chart.insert(new_edge, (right_edge,)):
  179. yield new_edge
  180. def __str__(self):
  181. return "%s" % self._combinator
  182. # Common sets of combinators used for English derivations.
  183. ApplicationRuleSet = [
  184. BinaryCombinatorRule(ForwardApplication),
  185. BinaryCombinatorRule(BackwardApplication),
  186. ]
  187. CompositionRuleSet = [
  188. BinaryCombinatorRule(ForwardComposition),
  189. BinaryCombinatorRule(BackwardComposition),
  190. BinaryCombinatorRule(BackwardBx),
  191. ]
  192. SubstitutionRuleSet = [
  193. BinaryCombinatorRule(ForwardSubstitution),
  194. BinaryCombinatorRule(BackwardSx),
  195. ]
  196. TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()]
  197. # The standard English rule set.
  198. DefaultRuleSet = (
  199. ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet
  200. )
  201. class CCGChartParser(ParserI):
  202. '''
  203. Chart parser for CCGs.
  204. Based largely on the ChartParser class from NLTK.
  205. '''
  206. def __init__(self, lexicon, rules, trace=0):
  207. self._lexicon = lexicon
  208. self._rules = rules
  209. self._trace = trace
  210. def lexicon(self):
  211. return self._lexicon
  212. # Implements the CYK algorithm
  213. def parse(self, tokens):
  214. tokens = list(tokens)
  215. chart = CCGChart(list(tokens))
  216. lex = self._lexicon
  217. # Initialize leaf edges.
  218. for index in range(chart.num_leaves()):
  219. for token in lex.categories(chart.leaf(index)):
  220. new_edge = CCGLeafEdge(index, token, chart.leaf(index))
  221. chart.insert(new_edge, ())
  222. # Select a span for the new edges
  223. for span in range(2, chart.num_leaves() + 1):
  224. for start in range(0, chart.num_leaves() - span + 1):
  225. # Try all possible pairs of edges that could generate
  226. # an edge for that span
  227. for part in range(1, span):
  228. lstart = start
  229. mid = start + part
  230. rend = start + span
  231. for left in chart.select(span=(lstart, mid)):
  232. for right in chart.select(span=(mid, rend)):
  233. # Generate all possible combinations of the two edges
  234. for rule in self._rules:
  235. edges_added_by_rule = 0
  236. for newedge in rule.apply(chart, lex, left, right):
  237. edges_added_by_rule += 1
  238. # Output the resulting parses
  239. return chart.parses(lex.start())
  240. class CCGChart(Chart):
  241. def __init__(self, tokens):
  242. Chart.__init__(self, tokens)
  243. # Constructs the trees for a given parse. Unfortnunately, the parse trees need to be
  244. # constructed slightly differently to those in the default Chart class, so it has to
  245. # be reimplemented
  246. def _trees(self, edge, complete, memo, tree_class):
  247. assert complete, "CCGChart cannot build incomplete trees"
  248. if edge in memo:
  249. return memo[edge]
  250. if isinstance(edge, CCGLeafEdge):
  251. word = tree_class(edge.token(), [self._tokens[edge.start()]])
  252. leaf = tree_class((edge.token(), "Leaf"), [word])
  253. memo[edge] = [leaf]
  254. return [leaf]
  255. memo[edge] = []
  256. trees = []
  257. for cpl in self.child_pointer_lists(edge):
  258. child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
  259. for children in itertools.product(*child_choices):
  260. lhs = (
  261. Token(
  262. self._tokens[edge.start() : edge.end()],
  263. edge.lhs(),
  264. compute_semantics(children, edge),
  265. ),
  266. str(edge.rule()),
  267. )
  268. trees.append(tree_class(lhs, children))
  269. memo[edge] = trees
  270. return trees
  271. def compute_semantics(children, edge):
  272. if children[0].label()[0].semantics() is None:
  273. return None
  274. if len(children) == 2:
  275. if isinstance(edge.rule(), BackwardCombinator):
  276. children = [children[1], children[0]]
  277. combinator = edge.rule()._combinator
  278. function = children[0].label()[0].semantics()
  279. argument = children[1].label()[0].semantics()
  280. if isinstance(combinator, UndirectedFunctionApplication):
  281. return compute_function_semantics(function, argument)
  282. elif isinstance(combinator, UndirectedComposition):
  283. return compute_composition_semantics(function, argument)
  284. elif isinstance(combinator, UndirectedSubstitution):
  285. return compute_substitution_semantics(function, argument)
  286. else:
  287. raise AssertionError('Unsupported combinator \'' + combinator + '\'')
  288. else:
  289. return compute_type_raised_semantics(children[0].label()[0].semantics())
  290. # --------
  291. # Displaying derivations
  292. # --------
  293. def printCCGDerivation(tree):
  294. # Get the leaves and initial categories
  295. leafcats = tree.pos()
  296. leafstr = ''
  297. catstr = ''
  298. # Construct a string with both the leaf word and corresponding
  299. # category aligned.
  300. for (leaf, cat) in leafcats:
  301. str_cat = "%s" % cat
  302. nextlen = 2 + max(len(leaf), len(str_cat))
  303. lcatlen = (nextlen - len(str_cat)) // 2
  304. rcatlen = lcatlen + (nextlen - len(str_cat)) % 2
  305. catstr += ' ' * lcatlen + str_cat + ' ' * rcatlen
  306. lleaflen = (nextlen - len(leaf)) // 2
  307. rleaflen = lleaflen + (nextlen - len(leaf)) % 2
  308. leafstr += ' ' * lleaflen + leaf + ' ' * rleaflen
  309. print(leafstr.rstrip())
  310. print(catstr.rstrip())
  311. # Display the derivation steps
  312. printCCGTree(0, tree)
  313. # Prints the sequence of derivation steps.
  314. def printCCGTree(lwidth, tree):
  315. rwidth = lwidth
  316. # Is a leaf (word).
  317. # Increment the span by the space occupied by the leaf.
  318. if not isinstance(tree, Tree):
  319. return 2 + lwidth + len(tree)
  320. # Find the width of the current derivation step
  321. for child in tree:
  322. rwidth = max(rwidth, printCCGTree(rwidth, child))
  323. # Is a leaf node.
  324. # Don't print anything, but account for the space occupied.
  325. if not isinstance(tree.label(), tuple):
  326. return max(
  327. rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])
  328. )
  329. (token, op) = tree.label()
  330. if op == 'Leaf':
  331. return rwidth
  332. # Pad to the left with spaces, followed by a sequence of '-'
  333. # and the derivation rule.
  334. print(lwidth * ' ' + (rwidth - lwidth) * '-' + "%s" % op)
  335. # Print the resulting category on a new line.
  336. str_res = "%s" % (token.categ())
  337. if token.semantics() is not None:
  338. str_res += " {" + str(token.semantics()) + "}"
  339. respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth
  340. print(respadlen * ' ' + str_res)
  341. return rwidth
  342. ### Demonstration code
  343. # Construct the lexicon
  344. lex = fromstring(
  345. '''
  346. :- S, NP, N, VP # Primitive categories, S is the target primitive
  347. Det :: NP/N # Family of words
  348. Pro :: NP
  349. TV :: VP/NP
  350. Modal :: (S\\NP)/VP # Backslashes need to be escaped
  351. I => Pro # Word -> Category mapping
  352. you => Pro
  353. the => Det
  354. # Variables have the special keyword 'var'
  355. # '.' prevents permutation
  356. # ',' prevents composition
  357. and => var\\.,var/.,var
  358. which => (N\\N)/(S/NP)
  359. will => Modal # Categories can be either explicit, or families.
  360. might => Modal
  361. cook => TV
  362. eat => TV
  363. mushrooms => N
  364. parsnips => N
  365. bacon => N
  366. '''
  367. )
  368. def demo():
  369. parser = CCGChartParser(lex, DefaultRuleSet)
  370. for parse in parser.parse("I might cook and eat the bacon".split()):
  371. printCCGDerivation(parse)
  372. if __name__ == '__main__':
  373. demo()