boxer.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614
  1. # Natural Language Toolkit: Interface to Boxer
  2. # <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer>
  3. #
  4. # Author: Dan Garrette <dhgarrette@gmail.com>
  5. #
  6. # Copyright (C) 2001-2019 NLTK Project
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
"""
An interface to Boxer.

This interface relies on the latest development (Subversion) version of
C&C and Boxer.

Usage:
  Set the environment variable CANDC to the bin directory of your CandC installation.
  The models directory should be in the CandC root directory.
  For example:
     /path/to/candc/
        bin/
            candc
            boxer
        models/
            boxer/
"""
  24. from __future__ import print_function, unicode_literals
  25. import os
  26. import re
  27. import operator
  28. import subprocess
  29. from optparse import OptionParser
  30. import tempfile
  31. from functools import reduce
  32. from nltk.internals import find_binary
  33. from nltk.sem.logic import (
  34. ExpectedMoreTokensException,
  35. LogicalExpressionException,
  36. UnexpectedTokenException,
  37. Variable,
  38. )
  39. from nltk.sem.drt import (
  40. DRS,
  41. DrtApplicationExpression,
  42. DrtEqualityExpression,
  43. DrtNegatedExpression,
  44. DrtOrExpression,
  45. DrtParser,
  46. DrtProposition,
  47. DrtTokens,
  48. DrtVariableExpression,
  49. )
  50. from nltk.compat import python_2_unicode_compatible
  51. class Boxer(object):
  52. """
  53. This class is an interface to Johan Bos's program Boxer, a wide-coverage
  54. semantic parser that produces Discourse Representation Structures (DRSs).
  55. """
  56. def __init__(
  57. self,
  58. boxer_drs_interpreter=None,
  59. elimeq=False,
  60. bin_dir=None,
  61. verbose=False,
  62. resolve=True,
  63. ):
  64. """
  65. :param boxer_drs_interpreter: A class that converts from the
  66. ``AbstractBoxerDrs`` object hierarchy to a different object. The
  67. default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK
  68. DRT hierarchy.
  69. :param elimeq: When set to true, Boxer removes all equalities from the
  70. DRSs and discourse referents standing in the equality relation are
  71. unified, but only if this can be done in a meaning-preserving manner.
  72. :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction.
  73. Resolution follows Van der Sandt's theory of binding and accommodation.
  74. """
  75. if boxer_drs_interpreter is None:
  76. boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
  77. self._boxer_drs_interpreter = boxer_drs_interpreter
  78. self._resolve = resolve
  79. self._elimeq = elimeq
  80. self.set_bin_dir(bin_dir, verbose)
  81. def set_bin_dir(self, bin_dir, verbose=False):
  82. self._candc_bin = self._find_binary('candc', bin_dir, verbose)
  83. self._candc_models_path = os.path.normpath(
  84. os.path.join(self._candc_bin[:-5], '../models')
  85. )
  86. self._boxer_bin = self._find_binary('boxer', bin_dir, verbose)
  87. def interpret(self, input, discourse_id=None, question=False, verbose=False):
  88. """
  89. Use Boxer to give a first order representation.
  90. :param input: str Input sentence to parse
  91. :param occur_index: bool Should predicates be occurrence indexed?
  92. :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
  93. :return: ``drt.DrtExpression``
  94. """
  95. discourse_ids = [discourse_id] if discourse_id is not None else None
  96. d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose)
  97. if not d:
  98. raise Exception('Unable to interpret: "{0}"'.format(input))
  99. return d
  100. def interpret_multi(self, input, discourse_id=None, question=False, verbose=False):
  101. """
  102. Use Boxer to give a first order representation.
  103. :param input: list of str Input sentences to parse as a single discourse
  104. :param occur_index: bool Should predicates be occurrence indexed?
  105. :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate.
  106. :return: ``drt.DrtExpression``
  107. """
  108. discourse_ids = [discourse_id] if discourse_id is not None else None
  109. d, = self.interpret_multi_sents([input], discourse_ids, question, verbose)
  110. if not d:
  111. raise Exception('Unable to interpret: "{0}"'.format(input))
  112. return d
  113. def interpret_sents(
  114. self, inputs, discourse_ids=None, question=False, verbose=False
  115. ):
  116. """
  117. Use Boxer to give a first order representation.
  118. :param inputs: list of str Input sentences to parse as individual discourses
  119. :param occur_index: bool Should predicates be occurrence indexed?
  120. :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
  121. :return: list of ``drt.DrtExpression``
  122. """
  123. return self.interpret_multi_sents(
  124. [[input] for input in inputs], discourse_ids, question, verbose
  125. )
  126. def interpret_multi_sents(
  127. self, inputs, discourse_ids=None, question=False, verbose=False
  128. ):
  129. """
  130. Use Boxer to give a first order representation.
  131. :param inputs: list of list of str Input discourses to parse
  132. :param occur_index: bool Should predicates be occurrence indexed?
  133. :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
  134. :return: ``drt.DrtExpression``
  135. """
  136. if discourse_ids is not None:
  137. assert len(inputs) == len(discourse_ids)
  138. assert reduce(operator.and_, (id is not None for id in discourse_ids))
  139. use_disc_id = True
  140. else:
  141. discourse_ids = list(map(str, range(len(inputs))))
  142. use_disc_id = False
  143. candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose)
  144. boxer_out = self._call_boxer(candc_out, verbose=verbose)
  145. # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out:
  146. # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str)
  147. drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
  148. return [drs_dict.get(id, None) for id in discourse_ids]
  149. def _call_candc(self, inputs, discourse_ids, question, verbose=False):
  150. """
  151. Call the ``candc`` binary with the given input.
  152. :param inputs: list of list of str Input discourses to parse
  153. :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate.
  154. :param filename: str A filename for the output file
  155. :return: stdout
  156. """
  157. args = [
  158. '--models',
  159. os.path.join(self._candc_models_path, ['boxer', 'questions'][question]),
  160. '--candc-printer',
  161. 'boxer',
  162. ]
  163. return self._call(
  164. '\n'.join(
  165. sum(
  166. (
  167. ["<META>'{0}'".format(id)] + d
  168. for d, id in zip(inputs, discourse_ids)
  169. ),
  170. [],
  171. )
  172. ),
  173. self._candc_bin,
  174. args,
  175. verbose,
  176. )
  177. def _call_boxer(self, candc_out, verbose=False):
  178. """
  179. Call the ``boxer`` binary with the given input.
  180. :param candc_out: str output from C&C parser
  181. :return: stdout
  182. """
  183. f = None
  184. try:
  185. fd, temp_filename = tempfile.mkstemp(
  186. prefix='boxer-', suffix='.in', text=True
  187. )
  188. f = os.fdopen(fd, 'w')
  189. f.write(candc_out)
  190. finally:
  191. if f:
  192. f.close()
  193. args = [
  194. '--box',
  195. 'false',
  196. '--semantics',
  197. 'drs',
  198. #'--flat', 'false', # removed from boxer
  199. '--resolve',
  200. ['false', 'true'][self._resolve],
  201. '--elimeq',
  202. ['false', 'true'][self._elimeq],
  203. '--format',
  204. 'prolog',
  205. '--instantiate',
  206. 'true',
  207. '--input',
  208. temp_filename,
  209. ]
  210. stdout = self._call(None, self._boxer_bin, args, verbose)
  211. os.remove(temp_filename)
  212. return stdout
  213. def _find_binary(self, name, bin_dir, verbose=False):
  214. return find_binary(
  215. name,
  216. path_to_bin=bin_dir,
  217. env_vars=['CANDC'],
  218. url='http://svn.ask.it.usyd.edu.au/trac/candc/',
  219. binary_names=[name, name + '.exe'],
  220. verbose=verbose,
  221. )
  222. def _call(self, input_str, binary, args=[], verbose=False):
  223. """
  224. Call the binary with the given input.
  225. :param input_str: A string whose contents are used as stdin.
  226. :param binary: The location of the binary to call
  227. :param args: A list of command-line arguments.
  228. :return: stdout
  229. """
  230. if verbose:
  231. print('Calling:', binary)
  232. print('Args:', args)
  233. print('Input:', input_str)
  234. print('Command:', binary + ' ' + ' '.join(args))
  235. # Call via a subprocess
  236. if input_str is None:
  237. cmd = [binary] + args
  238. p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  239. else:
  240. cmd = 'echo "{0}" | {1} {2}'.format(input_str, binary, ' '.join(args))
  241. p = subprocess.Popen(
  242. cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
  243. )
  244. stdout, stderr = p.communicate()
  245. if verbose:
  246. print('Return code:', p.returncode)
  247. if stdout:
  248. print('stdout:\n', stdout, '\n')
  249. if stderr:
  250. print('stderr:\n', stderr, '\n')
  251. if p.returncode != 0:
  252. raise Exception(
  253. 'ERROR CALLING: {0} {1}\nReturncode: {2}\n{3}'.format(
  254. binary, ' '.join(args), p.returncode, stderr
  255. )
  256. )
  257. return stdout
  258. def _parse_to_drs_dict(self, boxer_out, use_disc_id):
  259. lines = boxer_out.split('\n')
  260. drs_dict = {}
  261. i = 0
  262. while i < len(lines):
  263. line = lines[i]
  264. if line.startswith('id('):
  265. comma_idx = line.index(',')
  266. discourse_id = line[3:comma_idx]
  267. if discourse_id[0] == "'" and discourse_id[-1] == "'":
  268. discourse_id = discourse_id[1:-1]
  269. drs_id = line[comma_idx + 1 : line.index(')')]
  270. i += 1
  271. line = lines[i]
  272. assert line.startswith('sem({0},'.format(drs_id))
  273. if line[-4:] == "').'":
  274. line = line[:-4] + ")."
  275. assert line.endswith(').'), "can't parse line: {0}".format(line)
  276. search_start = len('sem({0},['.format(drs_id))
  277. brace_count = 1
  278. drs_start = -1
  279. for j, c in enumerate(line[search_start:]):
  280. if c == '[':
  281. brace_count += 1
  282. if c == ']':
  283. brace_count -= 1
  284. if brace_count == 0:
  285. drs_start = search_start + j + 1
  286. if line[drs_start : drs_start + 3] == "','":
  287. drs_start = drs_start + 3
  288. else:
  289. drs_start = drs_start + 1
  290. break
  291. assert drs_start > -1
  292. drs_input = line[drs_start:-2].strip()
  293. parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
  294. drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
  295. i += 1
  296. return drs_dict
  297. def _parse_drs(self, drs_string, discourse_id, use_disc_id):
  298. return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string)
  299. class BoxerOutputDrsParser(DrtParser):
  300. def __init__(self, discourse_id=None):
  301. """
  302. This class is used to parse the Prolog DRS output from Boxer into a
  303. hierarchy of python objects.
  304. """
  305. DrtParser.__init__(self)
  306. self.discourse_id = discourse_id
  307. self.sentence_id_offset = None
  308. self.quote_chars = [("'", "'", "\\", False)]
  309. def parse(self, data, signature=None):
  310. return DrtParser.parse(self, data, signature)
  311. def get_all_symbols(self):
  312. return ['(', ')', ',', '[', ']', ':']
  313. def handle(self, tok, context):
  314. return self.handle_drs(tok)
  315. def attempt_adjuncts(self, expression, context):
  316. return expression
  317. def parse_condition(self, indices):
  318. """
  319. Parse a DRS condition
  320. :return: list of ``DrtExpression``
  321. """
  322. tok = self.token()
  323. accum = self.handle_condition(tok, indices)
  324. if accum is None:
  325. raise UnexpectedTokenException(tok)
  326. return accum
  327. def handle_drs(self, tok):
  328. if tok == 'drs':
  329. return self.parse_drs()
  330. elif tok in ['merge', 'smerge']:
  331. return self._handle_binary_expression(self._make_merge_expression)(None, [])
  332. elif tok in ['alfa']:
  333. return self._handle_alfa(self._make_merge_expression)(None, [])
  334. def handle_condition(self, tok, indices):
  335. """
  336. Handle a DRS condition
  337. :param indices: list of int
  338. :return: list of ``DrtExpression``
  339. """
  340. if tok == 'not':
  341. return [self._handle_not()]
  342. if tok == 'or':
  343. conds = [self._handle_binary_expression(self._make_or_expression)]
  344. elif tok == 'imp':
  345. conds = [self._handle_binary_expression(self._make_imp_expression)]
  346. elif tok == 'eq':
  347. conds = [self._handle_eq()]
  348. elif tok == 'prop':
  349. conds = [self._handle_prop()]
  350. elif tok == 'pred':
  351. conds = [self._handle_pred()]
  352. elif tok == 'named':
  353. conds = [self._handle_named()]
  354. elif tok == 'rel':
  355. conds = [self._handle_rel()]
  356. elif tok == 'timex':
  357. conds = self._handle_timex()
  358. elif tok == 'card':
  359. conds = [self._handle_card()]
  360. elif tok == 'whq':
  361. conds = [self._handle_whq()]
  362. elif tok == 'duplex':
  363. conds = [self._handle_duplex()]
  364. else:
  365. conds = []
  366. return sum(
  367. [
  368. [cond(sent_index, word_indices) for cond in conds]
  369. for sent_index, word_indices in self._sent_and_word_indices(indices)
  370. ],
  371. [],
  372. )
  373. def _handle_not(self):
  374. self.assertToken(self.token(), '(')
  375. drs = self.process_next_expression(None)
  376. self.assertToken(self.token(), ')')
  377. return BoxerNot(drs)
  378. def _handle_pred(self):
  379. # pred(_G3943, dog, n, 0)
  380. self.assertToken(self.token(), '(')
  381. variable = self.parse_variable()
  382. self.assertToken(self.token(), ',')
  383. name = self.token()
  384. self.assertToken(self.token(), ',')
  385. pos = self.token()
  386. self.assertToken(self.token(), ',')
  387. sense = int(self.token())
  388. self.assertToken(self.token(), ')')
  389. def _handle_pred_f(sent_index, word_indices):
  390. return BoxerPred(
  391. self.discourse_id, sent_index, word_indices, variable, name, pos, sense
  392. )
  393. return _handle_pred_f
  394. def _handle_duplex(self):
  395. # duplex(whq, drs(...), var, drs(...))
  396. self.assertToken(self.token(), '(')
  397. # self.assertToken(self.token(), '[')
  398. ans_types = []
  399. # while self.token(0) != ']':
  400. # cat = self.token()
  401. # self.assertToken(self.token(), ':')
  402. # if cat == 'des':
  403. # ans_types.append(self.token())
  404. # elif cat == 'num':
  405. # ans_types.append('number')
  406. # typ = self.token()
  407. # if typ == 'cou':
  408. # ans_types.append('count')
  409. # else:
  410. # ans_types.append(typ)
  411. # else:
  412. # ans_types.append(self.token())
  413. # self.token() #swallow the ']'
  414. self.assertToken(self.token(), 'whq')
  415. self.assertToken(self.token(), ',')
  416. d1 = self.process_next_expression(None)
  417. self.assertToken(self.token(), ',')
  418. ref = self.parse_variable()
  419. self.assertToken(self.token(), ',')
  420. d2 = self.process_next_expression(None)
  421. self.assertToken(self.token(), ')')
  422. return lambda sent_index, word_indices: BoxerWhq(
  423. self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
  424. )
  425. def _handle_named(self):
  426. # named(x0, john, per, 0)
  427. self.assertToken(self.token(), '(')
  428. variable = self.parse_variable()
  429. self.assertToken(self.token(), ',')
  430. name = self.token()
  431. self.assertToken(self.token(), ',')
  432. type = self.token()
  433. self.assertToken(self.token(), ',')
  434. sense = self.token() # as per boxer rev 2554
  435. self.assertToken(self.token(), ')')
  436. return lambda sent_index, word_indices: BoxerNamed(
  437. self.discourse_id, sent_index, word_indices, variable, name, type, sense
  438. )
  439. def _handle_rel(self):
  440. # rel(_G3993, _G3943, agent, 0)
  441. self.assertToken(self.token(), '(')
  442. var1 = self.parse_variable()
  443. self.assertToken(self.token(), ',')
  444. var2 = self.parse_variable()
  445. self.assertToken(self.token(), ',')
  446. rel = self.token()
  447. self.assertToken(self.token(), ',')
  448. sense = int(self.token())
  449. self.assertToken(self.token(), ')')
  450. return lambda sent_index, word_indices: BoxerRel(
  451. self.discourse_id, sent_index, word_indices, var1, var2, rel, sense
  452. )
  453. def _handle_timex(self):
  454. # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
  455. self.assertToken(self.token(), '(')
  456. arg = self.parse_variable()
  457. self.assertToken(self.token(), ',')
  458. new_conds = self._handle_time_expression(arg)
  459. self.assertToken(self.token(), ')')
  460. return new_conds
  461. def _handle_time_expression(self, arg):
  462. # date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
  463. tok = self.token()
  464. self.assertToken(self.token(), '(')
  465. if tok == 'date':
  466. conds = self._handle_date(arg)
  467. elif tok == 'time':
  468. conds = self._handle_time(arg)
  469. else:
  470. return None
  471. self.assertToken(self.token(), ')')
  472. return [
  473. lambda sent_index, word_indices: BoxerPred(
  474. self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0
  475. )
  476. ] + [lambda sent_index, word_indices: cond for cond in conds]
  477. def _handle_date(self, arg):
  478. # []: (+), []:'XXXX', [1004]:'04', []:'XX'
  479. conds = []
  480. (sent_index, word_indices), = self._sent_and_word_indices(
  481. self._parse_index_list()
  482. )
  483. self.assertToken(self.token(), '(')
  484. pol = self.token()
  485. self.assertToken(self.token(), ')')
  486. conds.append(
  487. BoxerPred(
  488. self.discourse_id,
  489. sent_index,
  490. word_indices,
  491. arg,
  492. 'date_pol_{0}'.format(pol),
  493. 'a',
  494. 0,
  495. )
  496. )
  497. self.assertToken(self.token(), ',')
  498. (sent_index, word_indices), = self._sent_and_word_indices(
  499. self._parse_index_list()
  500. )
  501. year = self.token()
  502. if year != 'XXXX':
  503. year = year.replace(':', '_')
  504. conds.append(
  505. BoxerPred(
  506. self.discourse_id,
  507. sent_index,
  508. word_indices,
  509. arg,
  510. 'date_year_{0}'.format(year),
  511. 'a',
  512. 0,
  513. )
  514. )
  515. self.assertToken(self.token(), ',')
  516. (sent_index, word_indices), = self._sent_and_word_indices(
  517. self._parse_index_list()
  518. )
  519. month = self.token()
  520. if month != 'XX':
  521. conds.append(
  522. BoxerPred(
  523. self.discourse_id,
  524. sent_index,
  525. word_indices,
  526. arg,
  527. 'date_month_{0}'.format(month),
  528. 'a',
  529. 0,
  530. )
  531. )
  532. self.assertToken(self.token(), ',')
  533. (sent_index, word_indices), = self._sent_and_word_indices(
  534. self._parse_index_list()
  535. )
  536. day = self.token()
  537. if day != 'XX':
  538. conds.append(
  539. BoxerPred(
  540. self.discourse_id,
  541. sent_index,
  542. word_indices,
  543. arg,
  544. 'date_day_{0}'.format(day),
  545. 'a',
  546. 0,
  547. )
  548. )
  549. return conds
  550. def _handle_time(self, arg):
  551. # time([1018]:'18', []:'XX', []:'XX')
  552. conds = []
  553. self._parse_index_list()
  554. hour = self.token()
  555. if hour != 'XX':
  556. conds.append(self._make_atom('r_hour_2', arg, hour))
  557. self.assertToken(self.token(), ',')
  558. self._parse_index_list()
  559. min = self.token()
  560. if min != 'XX':
  561. conds.append(self._make_atom('r_min_2', arg, min))
  562. self.assertToken(self.token(), ',')
  563. self._parse_index_list()
  564. sec = self.token()
  565. if sec != 'XX':
  566. conds.append(self._make_atom('r_sec_2', arg, sec))
  567. return conds
  568. def _handle_card(self):
  569. # card(_G18535, 28, ge)
  570. self.assertToken(self.token(), '(')
  571. variable = self.parse_variable()
  572. self.assertToken(self.token(), ',')
  573. value = self.token()
  574. self.assertToken(self.token(), ',')
  575. type = self.token()
  576. self.assertToken(self.token(), ')')
  577. return lambda sent_index, word_indices: BoxerCard(
  578. self.discourse_id, sent_index, word_indices, variable, value, type
  579. )
  580. def _handle_prop(self):
  581. # prop(_G15949, drs(...))
  582. self.assertToken(self.token(), '(')
  583. variable = self.parse_variable()
  584. self.assertToken(self.token(), ',')
  585. drs = self.process_next_expression(None)
  586. self.assertToken(self.token(), ')')
  587. return lambda sent_index, word_indices: BoxerProp(
  588. self.discourse_id, sent_index, word_indices, variable, drs
  589. )
  590. def _parse_index_list(self):
  591. # [1001,1002]:
  592. indices = []
  593. self.assertToken(self.token(), '[')
  594. while self.token(0) != ']':
  595. indices.append(self.parse_index())
  596. if self.token(0) == ',':
  597. self.token() # swallow ','
  598. self.token() # swallow ']'
  599. self.assertToken(self.token(), ':')
  600. return indices
  601. def parse_drs(self):
  602. # drs([[1001]:_G3943],
  603. # [[1002]:pred(_G3943, dog, n, 0)]
  604. # )
  605. self.assertToken(self.token(), '(')
  606. self.assertToken(self.token(), '[')
  607. refs = set()
  608. while self.token(0) != ']':
  609. indices = self._parse_index_list()
  610. refs.add(self.parse_variable())
  611. if self.token(0) == ',':
  612. self.token() # swallow ','
  613. self.token() # swallow ']'
  614. self.assertToken(self.token(), ',')
  615. self.assertToken(self.token(), '[')
  616. conds = []
  617. while self.token(0) != ']':
  618. indices = self._parse_index_list()
  619. conds.extend(self.parse_condition(indices))
  620. if self.token(0) == ',':
  621. self.token() # swallow ','
  622. self.token() # swallow ']'
  623. self.assertToken(self.token(), ')')
  624. return BoxerDrs(list(refs), conds)
  625. def _handle_binary_expression(self, make_callback):
  626. self.assertToken(self.token(), '(')
  627. drs1 = self.process_next_expression(None)
  628. self.assertToken(self.token(), ',')
  629. drs2 = self.process_next_expression(None)
  630. self.assertToken(self.token(), ')')
  631. return lambda sent_index, word_indices: make_callback(
  632. sent_index, word_indices, drs1, drs2
  633. )
  634. def _handle_alfa(self, make_callback):
  635. self.assertToken(self.token(), '(')
  636. type = self.token()
  637. self.assertToken(self.token(), ',')
  638. drs1 = self.process_next_expression(None)
  639. self.assertToken(self.token(), ',')
  640. drs2 = self.process_next_expression(None)
  641. self.assertToken(self.token(), ')')
  642. return lambda sent_index, word_indices: make_callback(
  643. sent_index, word_indices, drs1, drs2
  644. )
  645. def _handle_eq(self):
  646. self.assertToken(self.token(), '(')
  647. var1 = self.parse_variable()
  648. self.assertToken(self.token(), ',')
  649. var2 = self.parse_variable()
  650. self.assertToken(self.token(), ')')
  651. return lambda sent_index, word_indices: BoxerEq(
  652. self.discourse_id, sent_index, word_indices, var1, var2
  653. )
  654. def _handle_whq(self):
  655. self.assertToken(self.token(), '(')
  656. self.assertToken(self.token(), '[')
  657. ans_types = []
  658. while self.token(0) != ']':
  659. cat = self.token()
  660. self.assertToken(self.token(), ':')
  661. if cat == 'des':
  662. ans_types.append(self.token())
  663. elif cat == 'num':
  664. ans_types.append('number')
  665. typ = self.token()
  666. if typ == 'cou':
  667. ans_types.append('count')
  668. else:
  669. ans_types.append(typ)
  670. else:
  671. ans_types.append(self.token())
  672. self.token() # swallow the ']'
  673. self.assertToken(self.token(), ',')
  674. d1 = self.process_next_expression(None)
  675. self.assertToken(self.token(), ',')
  676. ref = self.parse_variable()
  677. self.assertToken(self.token(), ',')
  678. d2 = self.process_next_expression(None)
  679. self.assertToken(self.token(), ')')
  680. return lambda sent_index, word_indices: BoxerWhq(
  681. self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2
  682. )
  683. def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
  684. return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
  685. def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
  686. return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)
  687. def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
  688. return BoxerDrs(drs1.refs, drs1.conds, drs2)
  689. def parse_variable(self):
  690. var = self.token()
  691. assert re.match('^[exps]\d+$', var), var
  692. return var
  693. def parse_index(self):
  694. return int(self.token())
  695. def _sent_and_word_indices(self, indices):
  696. """
  697. :return: list of (sent_index, word_indices) tuples
  698. """
  699. sent_indices = set((i / 1000) - 1 for i in indices if i >= 0)
  700. if sent_indices:
  701. pairs = []
  702. for sent_index in sent_indices:
  703. word_indices = [
  704. (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1
  705. ]
  706. pairs.append((sent_index, word_indices))
  707. return pairs
  708. else:
  709. word_indices = [(i % 1000) - 1 for i in indices]
  710. return [(None, word_indices)]
  711. class BoxerDrsParser(DrtParser):
  712. """
  713. Reparse the str form of subclasses of ``AbstractBoxerDrs``
  714. """
  715. def __init__(self, discourse_id=None):
  716. DrtParser.__init__(self)
  717. self.discourse_id = discourse_id
  718. def get_all_symbols(self):
  719. return [
  720. DrtTokens.OPEN,
  721. DrtTokens.CLOSE,
  722. DrtTokens.COMMA,
  723. DrtTokens.OPEN_BRACKET,
  724. DrtTokens.CLOSE_BRACKET,
  725. ]
  726. def attempt_adjuncts(self, expression, context):
  727. return expression
  728. def handle(self, tok, context):
  729. try:
  730. # if tok == 'drs':
  731. # self.assertNextToken(DrtTokens.OPEN)
  732. # label = int(self.token())
  733. # self.assertNextToken(DrtTokens.COMMA)
  734. # refs = list(map(int, self.handle_refs()))
  735. # self.assertNextToken(DrtTokens.COMMA)
  736. # conds = self.handle_conds(None)
  737. # self.assertNextToken(DrtTokens.CLOSE)
  738. # return BoxerDrs(label, refs, conds)
  739. if tok == 'pred':
  740. self.assertNextToken(DrtTokens.OPEN)
  741. disc_id = (
  742. self.discourse_id if self.discourse_id is not None else self.token()
  743. )
  744. self.assertNextToken(DrtTokens.COMMA)
  745. sent_id = self.nullableIntToken()
  746. self.assertNextToken(DrtTokens.COMMA)
  747. word_ids = list(map(int, self.handle_refs()))
  748. self.assertNextToken(DrtTokens.COMMA)
  749. variable = int(self.token())
  750. self.assertNextToken(DrtTokens.COMMA)
  751. name = self.token()
  752. self.assertNextToken(DrtTokens.COMMA)
  753. pos = self.token()
  754. self.assertNextToken(DrtTokens.COMMA)
  755. sense = int(self.token())
  756. self.assertNextToken(DrtTokens.CLOSE)
  757. return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
  758. elif tok == 'named':
  759. self.assertNextToken(DrtTokens.OPEN)
  760. disc_id = (
  761. self.discourse_id if self.discourse_id is not None else self.token()
  762. )
  763. self.assertNextToken(DrtTokens.COMMA)
  764. sent_id = int(self.token())
  765. self.assertNextToken(DrtTokens.COMMA)
  766. word_ids = map(int, self.handle_refs())
  767. self.assertNextToken(DrtTokens.COMMA)
  768. variable = int(self.token())
  769. self.assertNextToken(DrtTokens.COMMA)
  770. name = self.token()
  771. self.assertNextToken(DrtTokens.COMMA)
  772. type = self.token()
  773. self.assertNextToken(DrtTokens.COMMA)
  774. sense = int(self.token())
  775. self.assertNextToken(DrtTokens.CLOSE)
  776. return BoxerNamed(
  777. disc_id, sent_id, word_ids, variable, name, type, sense
  778. )
  779. elif tok == 'rel':
  780. self.assertNextToken(DrtTokens.OPEN)
  781. disc_id = (
  782. self.discourse_id if self.discourse_id is not None else self.token()
  783. )
  784. self.assertNextToken(DrtTokens.COMMA)
  785. sent_id = self.nullableIntToken()
  786. self.assertNextToken(DrtTokens.COMMA)
  787. word_ids = list(map(int, self.handle_refs()))
  788. self.assertNextToken(DrtTokens.COMMA)
  789. var1 = int(self.token())
  790. self.assertNextToken(DrtTokens.COMMA)
  791. var2 = int(self.token())
  792. self.assertNextToken(DrtTokens.COMMA)
  793. rel = self.token()
  794. self.assertNextToken(DrtTokens.COMMA)
  795. sense = int(self.token())
  796. self.assertNextToken(DrtTokens.CLOSE)
  797. return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
  798. elif tok == 'prop':
  799. self.assertNextToken(DrtTokens.OPEN)
  800. disc_id = (
  801. self.discourse_id if self.discourse_id is not None else self.token()
  802. )
  803. self.assertNextToken(DrtTokens.COMMA)
  804. sent_id = int(self.token())
  805. self.assertNextToken(DrtTokens.COMMA)
  806. word_ids = list(map(int, self.handle_refs()))
  807. self.assertNextToken(DrtTokens.COMMA)
  808. variable = int(self.token())
  809. self.assertNextToken(DrtTokens.COMMA)
  810. drs = self.process_next_expression(None)
  811. self.assertNextToken(DrtTokens.CLOSE)
  812. return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
  813. elif tok == 'not':
  814. self.assertNextToken(DrtTokens.OPEN)
  815. drs = self.process_next_expression(None)
  816. self.assertNextToken(DrtTokens.CLOSE)
  817. return BoxerNot(drs)
  818. elif tok == 'imp':
  819. self.assertNextToken(DrtTokens.OPEN)
  820. drs1 = self.process_next_expression(None)
  821. self.assertNextToken(DrtTokens.COMMA)
  822. drs2 = self.process_next_expression(None)
  823. self.assertNextToken(DrtTokens.CLOSE)
  824. return BoxerDrs(drs1.refs, drs1.conds, drs2)
  825. elif tok == 'or':
  826. self.assertNextToken(DrtTokens.OPEN)
  827. disc_id = (
  828. self.discourse_id if self.discourse_id is not None else self.token()
  829. )
  830. self.assertNextToken(DrtTokens.COMMA)
  831. sent_id = self.nullableIntToken()
  832. self.assertNextToken(DrtTokens.COMMA)
  833. word_ids = map(int, self.handle_refs())
  834. self.assertNextToken(DrtTokens.COMMA)
  835. drs1 = self.process_next_expression(None)
  836. self.assertNextToken(DrtTokens.COMMA)
  837. drs2 = self.process_next_expression(None)
  838. self.assertNextToken(DrtTokens.CLOSE)
  839. return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
  840. elif tok == 'eq':
  841. self.assertNextToken(DrtTokens.OPEN)
  842. disc_id = (
  843. self.discourse_id if self.discourse_id is not None else self.token()
  844. )
  845. self.assertNextToken(DrtTokens.COMMA)
  846. sent_id = self.nullableIntToken()
  847. self.assertNextToken(DrtTokens.COMMA)
  848. word_ids = list(map(int, self.handle_refs()))
  849. self.assertNextToken(DrtTokens.COMMA)
  850. var1 = int(self.token())
  851. self.assertNextToken(DrtTokens.COMMA)
  852. var2 = int(self.token())
  853. self.assertNextToken(DrtTokens.CLOSE)
  854. return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
  855. elif tok == 'card':
  856. self.assertNextToken(DrtTokens.OPEN)
  857. disc_id = (
  858. self.discourse_id if self.discourse_id is not None else self.token()
  859. )
  860. self.assertNextToken(DrtTokens.COMMA)
  861. sent_id = self.nullableIntToken()
  862. self.assertNextToken(DrtTokens.COMMA)
  863. word_ids = map(int, self.handle_refs())
  864. self.assertNextToken(DrtTokens.COMMA)
  865. var = int(self.token())
  866. self.assertNextToken(DrtTokens.COMMA)
  867. value = self.token()
  868. self.assertNextToken(DrtTokens.COMMA)
  869. type = self.token()
  870. self.assertNextToken(DrtTokens.CLOSE)
  871. return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
  872. elif tok == 'whq':
  873. self.assertNextToken(DrtTokens.OPEN)
  874. disc_id = (
  875. self.discourse_id if self.discourse_id is not None else self.token()
  876. )
  877. self.assertNextToken(DrtTokens.COMMA)
  878. sent_id = self.nullableIntToken()
  879. self.assertNextToken(DrtTokens.COMMA)
  880. word_ids = list(map(int, self.handle_refs()))
  881. self.assertNextToken(DrtTokens.COMMA)
  882. ans_types = self.handle_refs()
  883. self.assertNextToken(DrtTokens.COMMA)
  884. drs1 = self.process_next_expression(None)
  885. self.assertNextToken(DrtTokens.COMMA)
  886. var = int(self.token())
  887. self.assertNextToken(DrtTokens.COMMA)
  888. drs2 = self.process_next_expression(None)
  889. self.assertNextToken(DrtTokens.CLOSE)
  890. return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
  891. except Exception as e:
  892. raise LogicalExpressionException(self._currentIndex, str(e))
  893. assert False, repr(tok)
  894. def nullableIntToken(self):
  895. t = self.token()
  896. return int(t) if t != 'None' else None
  897. def get_next_token_variable(self, description):
  898. try:
  899. return self.token()
  900. except ExpectedMoreTokensException as e:
  901. raise ExpectedMoreTokensException(e.index, 'Variable expected.')
  902. class AbstractBoxerDrs(object):
  903. def variables(self):
  904. """
  905. :return: (set<variables>, set<events>, set<propositions>)
  906. """
  907. variables, events, propositions = self._variables()
  908. return (variables - (events | propositions), events, propositions - events)
  909. def variable_types(self):
  910. vartypes = {}
  911. for t, vars in zip(('z', 'e', 'p'), self.variables()):
  912. for v in vars:
  913. vartypes[v] = t
  914. return vartypes
  915. def _variables(self):
  916. """
  917. :return: (set<variables>, set<events>, set<propositions>)
  918. """
  919. return (set(), set(), set())
  920. def atoms(self):
  921. return set()
  922. def clean(self):
  923. return self
  924. def _clean_name(self, name):
  925. return name.replace('-', '_').replace("'", "_")
  926. def renumber_sentences(self, f):
  927. return self
  928. def __hash__(self):
  929. return hash("{0}".format(self))
  930. @python_2_unicode_compatible
  931. class BoxerDrs(AbstractBoxerDrs):
  932. def __init__(self, refs, conds, consequent=None):
  933. AbstractBoxerDrs.__init__(self)
  934. self.refs = refs
  935. self.conds = conds
  936. self.consequent = consequent
  937. def _variables(self):
  938. variables = (set(), set(), set())
  939. for cond in self.conds:
  940. for s, v in zip(variables, cond._variables()):
  941. s.update(v)
  942. if self.consequent is not None:
  943. for s, v in zip(variables, self.consequent._variables()):
  944. s.update(v)
  945. return variables
  946. def atoms(self):
  947. atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set())
  948. if self.consequent is not None:
  949. atoms.update(self.consequent.atoms())
  950. return atoms
  951. def clean(self):
  952. consequent = self.consequent.clean() if self.consequent else None
  953. return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent)
  954. def renumber_sentences(self, f):
  955. consequent = self.consequent.renumber_sentences(f) if self.consequent else None
  956. return BoxerDrs(
  957. self.refs, [c.renumber_sentences(f) for c in self.conds], consequent
  958. )
  959. def __repr__(self):
  960. s = 'drs([%s], [%s])' % (
  961. ', '.join("%s" % r for r in self.refs),
  962. ', '.join("%s" % c for c in self.conds),
  963. )
  964. if self.consequent is not None:
  965. s = 'imp(%s, %s)' % (s, self.consequent)
  966. return s
  967. def __eq__(self, other):
  968. return (
  969. self.__class__ == other.__class__
  970. and self.refs == other.refs
  971. and len(self.conds) == len(other.conds)
  972. and reduce(
  973. operator.and_, (c1 == c2 for c1, c2 in zip(self.conds, other.conds))
  974. )
  975. and self.consequent == other.consequent
  976. )
  977. def __ne__(self, other):
  978. return not self == other
  979. __hash__ = AbstractBoxerDrs.__hash__
  980. @python_2_unicode_compatible
  981. class BoxerNot(AbstractBoxerDrs):
  982. def __init__(self, drs):
  983. AbstractBoxerDrs.__init__(self)
  984. self.drs = drs
  985. def _variables(self):
  986. return self.drs._variables()
  987. def atoms(self):
  988. return self.drs.atoms()
  989. def clean(self):
  990. return BoxerNot(self.drs.clean())
  991. def renumber_sentences(self, f):
  992. return BoxerNot(self.drs.renumber_sentences(f))
  993. def __repr__(self):
  994. return 'not(%s)' % (self.drs)
  995. def __eq__(self, other):
  996. return self.__class__ == other.__class__ and self.drs == other.drs
  997. def __ne__(self, other):
  998. return not self == other
  999. __hash__ = AbstractBoxerDrs.__hash__
  1000. @python_2_unicode_compatible
  1001. class BoxerIndexed(AbstractBoxerDrs):
  1002. def __init__(self, discourse_id, sent_index, word_indices):
  1003. AbstractBoxerDrs.__init__(self)
  1004. self.discourse_id = discourse_id
  1005. self.sent_index = sent_index
  1006. self.word_indices = word_indices
  1007. def atoms(self):
  1008. return set([self])
  1009. def __eq__(self, other):
  1010. return (
  1011. self.__class__ == other.__class__
  1012. and self.discourse_id == other.discourse_id
  1013. and self.sent_index == other.sent_index
  1014. and self.word_indices == other.word_indices
  1015. and reduce(operator.and_, (s == o for s, o in zip(self, other)))
  1016. )
  1017. def __ne__(self, other):
  1018. return not self == other
  1019. __hash__ = AbstractBoxerDrs.__hash__
  1020. def __repr__(self):
  1021. s = '%s(%s, %s, [%s]' % (
  1022. self._pred(),
  1023. self.discourse_id,
  1024. self.sent_index,
  1025. ', '.join("%s" % wi for wi in self.word_indices),
  1026. )
  1027. for v in self:
  1028. s += ', %s' % v
  1029. return s + ')'
  1030. class BoxerPred(BoxerIndexed):
  1031. def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
  1032. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1033. self.var = var
  1034. self.name = name
  1035. self.pos = pos
  1036. self.sense = sense
  1037. def _variables(self):
  1038. return (set([self.var]), set(), set())
  1039. def change_var(self, var):
  1040. return BoxerPred(
  1041. self.discourse_id,
  1042. self.sent_index,
  1043. self.word_indices,
  1044. var,
  1045. self.name,
  1046. self.pos,
  1047. self.sense,
  1048. )
  1049. def clean(self):
  1050. return BoxerPred(
  1051. self.discourse_id,
  1052. self.sent_index,
  1053. self.word_indices,
  1054. self.var,
  1055. self._clean_name(self.name),
  1056. self.pos,
  1057. self.sense,
  1058. )
  1059. def renumber_sentences(self, f):
  1060. new_sent_index = f(self.sent_index)
  1061. return BoxerPred(
  1062. self.discourse_id,
  1063. new_sent_index,
  1064. self.word_indices,
  1065. self.var,
  1066. self.name,
  1067. self.pos,
  1068. self.sense,
  1069. )
  1070. def __iter__(self):
  1071. return iter((self.var, self.name, self.pos, self.sense))
  1072. def _pred(self):
  1073. return 'pred'
  1074. class BoxerNamed(BoxerIndexed):
  1075. def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
  1076. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1077. self.var = var
  1078. self.name = name
  1079. self.type = type
  1080. self.sense = sense
  1081. def _variables(self):
  1082. return (set([self.var]), set(), set())
  1083. def change_var(self, var):
  1084. return BoxerNamed(
  1085. self.discourse_id,
  1086. self.sent_index,
  1087. self.word_indices,
  1088. var,
  1089. self.name,
  1090. self.type,
  1091. self.sense,
  1092. )
  1093. def clean(self):
  1094. return BoxerNamed(
  1095. self.discourse_id,
  1096. self.sent_index,
  1097. self.word_indices,
  1098. self.var,
  1099. self._clean_name(self.name),
  1100. self.type,
  1101. self.sense,
  1102. )
  1103. def renumber_sentences(self, f):
  1104. return BoxerNamed(
  1105. self.discourse_id,
  1106. f(self.sent_index),
  1107. self.word_indices,
  1108. self.var,
  1109. self.name,
  1110. self.type,
  1111. self.sense,
  1112. )
  1113. def __iter__(self):
  1114. return iter((self.var, self.name, self.type, self.sense))
  1115. def _pred(self):
  1116. return 'named'
  1117. class BoxerRel(BoxerIndexed):
  1118. def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
  1119. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1120. self.var1 = var1
  1121. self.var2 = var2
  1122. self.rel = rel
  1123. self.sense = sense
  1124. def _variables(self):
  1125. return (set([self.var1, self.var2]), set(), set())
  1126. def clean(self):
  1127. return BoxerRel(
  1128. self.discourse_id,
  1129. self.sent_index,
  1130. self.word_indices,
  1131. self.var1,
  1132. self.var2,
  1133. self._clean_name(self.rel),
  1134. self.sense,
  1135. )
  1136. def renumber_sentences(self, f):
  1137. return BoxerRel(
  1138. self.discourse_id,
  1139. f(self.sent_index),
  1140. self.word_indices,
  1141. self.var1,
  1142. self.var2,
  1143. self.rel,
  1144. self.sense,
  1145. )
  1146. def __iter__(self):
  1147. return iter((self.var1, self.var2, self.rel, self.sense))
  1148. def _pred(self):
  1149. return 'rel'
  1150. class BoxerProp(BoxerIndexed):
  1151. def __init__(self, discourse_id, sent_index, word_indices, var, drs):
  1152. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1153. self.var = var
  1154. self.drs = drs
  1155. def _variables(self):
  1156. return tuple(
  1157. map(operator.or_, (set(), set(), set([self.var])), self.drs._variables())
  1158. )
  1159. def referenced_labels(self):
  1160. return set([self.drs])
  1161. def atoms(self):
  1162. return self.drs.atoms()
  1163. def clean(self):
  1164. return BoxerProp(
  1165. self.discourse_id,
  1166. self.sent_index,
  1167. self.word_indices,
  1168. self.var,
  1169. self.drs.clean(),
  1170. )
  1171. def renumber_sentences(self, f):
  1172. return BoxerProp(
  1173. self.discourse_id,
  1174. f(self.sent_index),
  1175. self.word_indices,
  1176. self.var,
  1177. self.drs.renumber_sentences(f),
  1178. )
  1179. def __iter__(self):
  1180. return iter((self.var, self.drs))
  1181. def _pred(self):
  1182. return 'prop'
  1183. class BoxerEq(BoxerIndexed):
  1184. def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
  1185. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1186. self.var1 = var1
  1187. self.var2 = var2
  1188. def _variables(self):
  1189. return (set([self.var1, self.var2]), set(), set())
  1190. def atoms(self):
  1191. return set()
  1192. def renumber_sentences(self, f):
  1193. return BoxerEq(
  1194. self.discourse_id,
  1195. f(self.sent_index),
  1196. self.word_indices,
  1197. self.var1,
  1198. self.var2,
  1199. )
  1200. def __iter__(self):
  1201. return iter((self.var1, self.var2))
  1202. def _pred(self):
  1203. return 'eq'
  1204. class BoxerCard(BoxerIndexed):
  1205. def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
  1206. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1207. self.var = var
  1208. self.value = value
  1209. self.type = type
  1210. def _variables(self):
  1211. return (set([self.var]), set(), set())
  1212. def renumber_sentences(self, f):
  1213. return BoxerCard(
  1214. self.discourse_id,
  1215. f(self.sent_index),
  1216. self.word_indices,
  1217. self.var,
  1218. self.value,
  1219. self.type,
  1220. )
  1221. def __iter__(self):
  1222. return iter((self.var, self.value, self.type))
  1223. def _pred(self):
  1224. return 'card'
  1225. class BoxerOr(BoxerIndexed):
  1226. def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
  1227. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1228. self.drs1 = drs1
  1229. self.drs2 = drs2
  1230. def _variables(self):
  1231. return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))
  1232. def atoms(self):
  1233. return self.drs1.atoms() | self.drs2.atoms()
  1234. def clean(self):
  1235. return BoxerOr(
  1236. self.discourse_id,
  1237. self.sent_index,
  1238. self.word_indices,
  1239. self.drs1.clean(),
  1240. self.drs2.clean(),
  1241. )
  1242. def renumber_sentences(self, f):
  1243. return BoxerOr(
  1244. self.discourse_id,
  1245. f(self.sent_index),
  1246. self.word_indices,
  1247. self.drs1,
  1248. self.drs2,
  1249. )
  1250. def __iter__(self):
  1251. return iter((self.drs1, self.drs2))
  1252. def _pred(self):
  1253. return 'or'
  1254. class BoxerWhq(BoxerIndexed):
  1255. def __init__(
  1256. self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2
  1257. ):
  1258. BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
  1259. self.ans_types = ans_types
  1260. self.drs1 = drs1
  1261. self.variable = variable
  1262. self.drs2 = drs2
  1263. def _variables(self):
  1264. return tuple(
  1265. map(
  1266. operator.or_,
  1267. (set([self.variable]), set(), set()),
  1268. self.drs1._variables(),
  1269. self.drs2._variables(),
  1270. )
  1271. )
  1272. def atoms(self):
  1273. return self.drs1.atoms() | self.drs2.atoms()
  1274. def clean(self):
  1275. return BoxerWhq(
  1276. self.discourse_id,
  1277. self.sent_index,
  1278. self.word_indices,
  1279. self.ans_types,
  1280. self.drs1.clean(),
  1281. self.variable,
  1282. self.drs2.clean(),
  1283. )
  1284. def renumber_sentences(self, f):
  1285. return BoxerWhq(
  1286. self.discourse_id,
  1287. f(self.sent_index),
  1288. self.word_indices,
  1289. self.ans_types,
  1290. self.drs1,
  1291. self.variable,
  1292. self.drs2,
  1293. )
  1294. def __iter__(self):
  1295. return iter(
  1296. ('[' + ','.join(self.ans_types) + ']', self.drs1, self.variable, self.drs2)
  1297. )
  1298. def _pred(self):
  1299. return 'whq'
  1300. class PassthroughBoxerDrsInterpreter(object):
  1301. def interpret(self, ex):
  1302. return ex
  1303. class NltkDrtBoxerDrsInterpreter(object):
  1304. def __init__(self, occur_index=False):
  1305. self._occur_index = occur_index
  1306. def interpret(self, ex):
  1307. """
  1308. :param ex: ``AbstractBoxerDrs``
  1309. :return: ``DrtExpression``
  1310. """
  1311. if isinstance(ex, BoxerDrs):
  1312. drs = DRS(
  1313. [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds))
  1314. )
  1315. if ex.consequent is not None:
  1316. drs.consequent = self.interpret(ex.consequent)
  1317. return drs
  1318. elif isinstance(ex, BoxerNot):
  1319. return DrtNegatedExpression(self.interpret(ex.drs))
  1320. elif isinstance(ex, BoxerPred):
  1321. pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
  1322. return self._make_atom(pred, ex.var)
  1323. elif isinstance(ex, BoxerNamed):
  1324. pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
  1325. return self._make_atom(pred, ex.var)
  1326. elif isinstance(ex, BoxerRel):
  1327. pred = self._add_occur_indexing('%s' % (ex.rel), ex)
  1328. return self._make_atom(pred, ex.var1, ex.var2)
  1329. elif isinstance(ex, BoxerProp):
  1330. return DrtProposition(Variable(ex.var), self.interpret(ex.drs))
  1331. elif isinstance(ex, BoxerEq):
  1332. return DrtEqualityExpression(
  1333. DrtVariableExpression(Variable(ex.var1)),
  1334. DrtVariableExpression(Variable(ex.var2)),
  1335. )
  1336. elif isinstance(ex, BoxerCard):
  1337. pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
  1338. return self._make_atom(pred, ex.var)
  1339. elif isinstance(ex, BoxerOr):
  1340. return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))
  1341. elif isinstance(ex, BoxerWhq):
  1342. drs1 = self.interpret(ex.drs1)
  1343. drs2 = self.interpret(ex.drs2)
  1344. return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds)
  1345. assert False, '%s: %s' % (ex.__class__.__name__, ex)
  1346. def _make_atom(self, pred, *args):
  1347. accum = DrtVariableExpression(Variable(pred))
  1348. for arg in args:
  1349. accum = DrtApplicationExpression(
  1350. accum, DrtVariableExpression(Variable(arg))
  1351. )
  1352. return accum
  1353. def _add_occur_indexing(self, base, ex):
  1354. if self._occur_index and ex.sent_index is not None:
  1355. if ex.discourse_id:
  1356. base += '_%s' % ex.discourse_id
  1357. base += '_s%s' % ex.sent_index
  1358. base += '_w%s' % sorted(ex.word_indices)[0]
  1359. return base
  1360. class UnparseableInputException(Exception):
  1361. pass
  1362. if __name__ == '__main__':
  1363. opts = OptionParser("usage: %prog TEXT [options]")
  1364. opts.add_option(
  1365. "--verbose",
  1366. "-v",
  1367. help="display verbose logs",
  1368. action="store_true",
  1369. default=False,
  1370. dest="verbose",
  1371. )
  1372. opts.add_option(
  1373. "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol"
  1374. )
  1375. opts.add_option(
  1376. "--question",
  1377. "-q",
  1378. help="input is a question",
  1379. action="store_true",
  1380. default=False,
  1381. dest="question",
  1382. )
  1383. opts.add_option(
  1384. "--occur",
  1385. "-o",
  1386. help="occurrence index",
  1387. action="store_true",
  1388. default=False,
  1389. dest="occur_index",
  1390. )
  1391. (options, args) = opts.parse_args()
  1392. if len(args) != 1:
  1393. opts.error("incorrect number of arguments")
  1394. interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
  1395. drs = Boxer(interpreter).interpret_multi(
  1396. args[0].split(r'\n'), question=options.question, verbose=options.verbose
  1397. )
  1398. if drs is None:
  1399. print(None)
  1400. else:
  1401. drs = drs.simplify().eliminate_equality()
  1402. if options.fol:
  1403. print(drs.fol().normalize())
  1404. else:
  1405. drs.pretty_print()