malt.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Interface to MaltParser
  3. #
  4. # Author: Dan Garrette <dhgarrette@gmail.com>
  5. # Contributor: Liling Tan, Mustufain, osamamukhtar11
  6. #
  7. # Copyright (C) 2001-2019 NLTK Project
  8. # URL: <http://nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. from __future__ import print_function, unicode_literals
  11. import os
  12. import sys
  13. import tempfile
  14. import subprocess
  15. import inspect
  16. from six import text_type
  17. from nltk.data import ZipFilePathPointer
  18. from nltk.internals import find_dir, find_file, find_jars_within_path
  19. from nltk.parse.api import ParserI
  20. from nltk.parse.dependencygraph import DependencyGraph
  21. from nltk.parse.util import taggedsents_to_conll
  22. def malt_regex_tagger():
  23. from nltk.tag import RegexpTagger
  24. _tagger = RegexpTagger(
  25. [
  26. (r'\.$', '.'),
  27. (r'\,$', ','),
  28. (r'\?$', '?'), # fullstop, comma, Qmark
  29. (r'\($', '('),
  30. (r'\)$', ')'), # round brackets
  31. (r'\[$', '['),
  32. (r'\]$', ']'), # square brackets
  33. (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
  34. (r'(The|the|A|a|An|an)$', 'DT'), # articles
  35. (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
  36. (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possesive
  37. (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possesive
  38. (r'(on|On|in|In|at|At|since|Since)$', 'IN'), # time prepopsitions
  39. (r'(for|For|ago|Ago|before|Before)$', 'IN'), # time prepopsitions
  40. (r'(till|Till|until|Until)$', 'IN'), # time prepopsitions
  41. (r'(by|By|beside|Beside)$', 'IN'), # space prepopsitions
  42. (r'(under|Under|below|Below)$', 'IN'), # space prepopsitions
  43. (r'(over|Over|above|Above)$', 'IN'), # space prepopsitions
  44. (r'(across|Across|through|Through)$', 'IN'), # space prepopsitions
  45. (r'(into|Into|towards|Towards)$', 'IN'), # space prepopsitions
  46. (r'(onto|Onto|from|From)$', 'IN'), # space prepopsitions
  47. (r'.*able$', 'JJ'), # adjectives
  48. (r'.*ness$', 'NN'), # nouns formed from adjectives
  49. (r'.*ly$', 'RB'), # adverbs
  50. (r'.*s$', 'NNS'), # plural nouns
  51. (r'.*ing$', 'VBG'), # gerunds
  52. (r'.*ed$', 'VBD'), # past tense verbs
  53. (r'.*', 'NN'), # nouns (default)
  54. ]
  55. )
  56. return _tagger.tag
  57. def find_maltparser(parser_dirname):
  58. """
  59. A module to find MaltParser .jar file and its dependencies.
  60. """
  61. if os.path.exists(parser_dirname): # If a full path is given.
  62. _malt_dir = parser_dirname
  63. else: # Try to find path to maltparser directory in environment variables.
  64. _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))
  65. # Checks that that the found directory contains all the necessary .jar
  66. malt_dependencies = ['', '', '']
  67. _malt_jars = set(find_jars_within_path(_malt_dir))
  68. _jars = set(os.path.split(jar)[1] for jar in _malt_jars)
  69. malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'])
  70. assert malt_dependencies.issubset(_jars)
  71. assert any(
  72. filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars)
  73. )
  74. return list(_malt_jars)
  75. def find_malt_model(model_filename):
  76. """
  77. A module to find pre-trained MaltParser model.
  78. """
  79. if model_filename is None:
  80. return 'malt_temp.mco'
  81. elif os.path.exists(model_filename): # If a full path is given.
  82. return model_filename
  83. else: # Try to find path to malt model in environment variables.
  84. return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
  85. class MaltParser(ParserI):
  86. """
  87. A class for dependency parsing with MaltParser. The input is the paths to:
  88. - a maltparser directory
  89. - (optionally) the path to a pre-trained MaltParser .mco model file
  90. - (optionally) the tagger to use for POS tagging before parsing
  91. - (optionally) additional Java arguments
  92. Example:
  93. >>> from nltk.parse import malt
  94. >>> # With MALT_PARSER and MALT_MODEL environment set.
  95. >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
  96. >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
  97. (shot I (elephant an) (in (pajamas my)) .)
  98. >>> # Without MALT_PARSER and MALT_MODEL environment.
  99. >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
  100. >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
  101. (shot I (elephant an) (in (pajamas my)) .)
  102. """
  103. def __init__(
  104. self,
  105. parser_dirname,
  106. model_filename=None,
  107. tagger=None,
  108. additional_java_args=None,
  109. ):
  110. """
  111. An interface for parsing with the Malt Parser.
  112. :param parser_dirname: The path to the maltparser directory that
  113. contains the maltparser-1.x.jar
  114. :type parser_dirname: str
  115. :param model_filename: The name of the pre-trained model with .mco file
  116. extension. If provided, training will not be required.
  117. (see http://www.maltparser.org/mco/mco.html and
  118. see http://www.patful.com/chalk/node/185)
  119. :type model_filename: str
  120. :param tagger: The tagger used to POS tag the raw string before
  121. formatting to CONLL format. It should behave like `nltk.pos_tag`
  122. :type tagger: function
  123. :param additional_java_args: This is the additional Java arguments that
  124. one can use when calling Maltparser, usually this is the heapsize
  125. limits, e.g. `additional_java_args=['-Xmx1024m']`
  126. (see http://goo.gl/mpDBvQ)
  127. :type additional_java_args: list
  128. """
  129. # Find all the necessary jar files for MaltParser.
  130. self.malt_jars = find_maltparser(parser_dirname)
  131. # Initialize additional java arguments.
  132. self.additional_java_args = (
  133. additional_java_args if additional_java_args is not None else []
  134. )
  135. # Initialize model.
  136. self.model = find_malt_model(model_filename)
  137. self._trained = self.model != 'malt_temp.mco'
  138. # Set the working_dir parameters i.e. `-w` from MaltParser's option.
  139. self.working_dir = tempfile.gettempdir()
  140. # Initialize POS tagger.
  141. self.tagger = tagger if tagger is not None else malt_regex_tagger()
  142. def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
  143. """
  144. Use MaltParser to parse multiple POS tagged sentences. Takes multiple
  145. sentences where each sentence is a list of (word, tag) tuples.
  146. The sentences must have already been tokenized and tagged.
  147. :param sentences: Input sentences to parse
  148. :type sentence: list(list(tuple(str, str)))
  149. :return: iter(iter(``DependencyGraph``)) the dependency graph
  150. representation of each sentence
  151. """
  152. if not self._trained:
  153. raise Exception("Parser has not been trained. Call train() first.")
  154. with tempfile.NamedTemporaryFile(
  155. prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False
  156. ) as input_file:
  157. with tempfile.NamedTemporaryFile(
  158. prefix='malt_output.conll.',
  159. dir=self.working_dir,
  160. mode='w',
  161. delete=False,
  162. ) as output_file:
  163. # Convert list of sentences to CONLL format.
  164. for line in taggedsents_to_conll(sentences):
  165. input_file.write(text_type(line))
  166. input_file.close()
  167. # Generate command to run maltparser.
  168. cmd = self.generate_malt_command(
  169. input_file.name, output_file.name, mode="parse"
  170. )
  171. # This is a maltparser quirk, it needs to be run
  172. # where the model file is. otherwise it goes into an awkward
  173. # missing .jars or strange -w working_dir problem.
  174. _current_path = os.getcwd() # Remembers the current path.
  175. try: # Change to modelfile path
  176. os.chdir(os.path.split(self.model)[0])
  177. except:
  178. pass
  179. ret = self._execute(cmd, verbose) # Run command.
  180. os.chdir(_current_path) # Change back to current path.
  181. if ret is not 0:
  182. raise Exception(
  183. "MaltParser parsing (%s) failed with exit "
  184. "code %d" % (' '.join(cmd), ret)
  185. )
  186. # Must return iter(iter(Tree))
  187. with open(output_file.name) as infile:
  188. for tree_str in infile.read().split('\n\n'):
  189. yield (
  190. iter(
  191. [
  192. DependencyGraph(
  193. tree_str, top_relation_label=top_relation_label
  194. )
  195. ]
  196. )
  197. )
  198. os.remove(input_file.name)
  199. os.remove(output_file.name)
  200. def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
  201. """
  202. Use MaltParser to parse multiple sentences.
  203. Takes a list of sentences, where each sentence is a list of words.
  204. Each sentence will be automatically tagged with this
  205. MaltParser instance's tagger.
  206. :param sentences: Input sentences to parse
  207. :type sentence: list(list(str))
  208. :return: iter(DependencyGraph)
  209. """
  210. tagged_sentences = (self.tagger(sentence) for sentence in sentences)
  211. return self.parse_tagged_sents(
  212. tagged_sentences, verbose, top_relation_label=top_relation_label
  213. )
  214. def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
  215. """
  216. This function generates the maltparser command use at the terminal.
  217. :param inputfilename: path to the input file
  218. :type inputfilename: str
  219. :param outputfilename: path to the output file
  220. :type outputfilename: str
  221. """
  222. cmd = ['java']
  223. cmd += self.additional_java_args # Adds additional java arguments
  224. # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
  225. classpaths_separator = ';' if sys.platform.startswith('win') else ':'
  226. cmd += [
  227. '-cp',
  228. classpaths_separator.join(self.malt_jars),
  229. ] # Adds classpaths for jars
  230. cmd += ['org.maltparser.Malt'] # Adds the main function.
  231. # Adds the model file.
  232. if os.path.exists(self.model): # when parsing
  233. cmd += ['-c', os.path.split(self.model)[-1]]
  234. else: # when learning
  235. cmd += ['-c', self.model]
  236. cmd += ['-i', inputfilename]
  237. if mode == 'parse':
  238. cmd += ['-o', outputfilename]
  239. cmd += ['-m', mode] # mode use to generate parses.
  240. return cmd
  241. @staticmethod
  242. def _execute(cmd, verbose=False):
  243. output = None if verbose else subprocess.PIPE
  244. p = subprocess.Popen(cmd, stdout=output, stderr=output)
  245. return p.wait()
  246. def train(self, depgraphs, verbose=False):
  247. """
  248. Train MaltParser from a list of ``DependencyGraph`` objects
  249. :param depgraphs: list of ``DependencyGraph`` objects for training input data
  250. :type depgraphs: DependencyGraph
  251. """
  252. # Write the conll_str to malt_train.conll file in /tmp/
  253. with tempfile.NamedTemporaryFile(
  254. prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
  255. ) as input_file:
  256. input_str = '\n'.join(dg.to_conll(10) for dg in depgraphs)
  257. input_file.write(text_type(input_str))
  258. # Trains the model with the malt_train.conll
  259. self.train_from_file(input_file.name, verbose=verbose)
  260. # Removes the malt_train.conll once training finishes.
  261. os.remove(input_file.name)
  262. def train_from_file(self, conll_file, verbose=False):
  263. """
  264. Train MaltParser from a file
  265. :param conll_file: str for the filename of the training input data
  266. :type conll_file: str
  267. """
  268. # If conll_file is a ZipFilePathPointer,
  269. # then we need to do some extra massaging
  270. if isinstance(conll_file, ZipFilePathPointer):
  271. with tempfile.NamedTemporaryFile(
  272. prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
  273. ) as input_file:
  274. with conll_file.open() as conll_input_file:
  275. conll_str = conll_input_file.read()
  276. input_file.write(text_type(conll_str))
  277. return self.train_from_file(input_file.name, verbose=verbose)
  278. # Generate command to run maltparser.
  279. cmd = self.generate_malt_command(conll_file, mode="learn")
  280. ret = self._execute(cmd, verbose)
  281. if ret != 0:
  282. raise Exception(
  283. "MaltParser training (%s) failed with exit "
  284. "code %d" % (' '.join(cmd), ret)
  285. )
  286. self._trained = True
  287. if __name__ == '__main__':
  288. '''
  289. A demonstration function to show how NLTK users can use the malt parser API.
  290. >>> from nltk import pos_tag
  291. >>> assert 'MALT_PARSER' in os.environ, str(
  292. ... "Please set MALT_PARSER in your global environment, e.g.:\n"
  293. ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
  294. >>>
  295. >>> assert 'MALT_MODEL' in os.environ, str(
  296. ... "Please set MALT_MODEL in your global environment, e.g.:\n"
  297. ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
  298. >>>
  299. >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
  300. ... "2 sees _ VB _ _ 0 ROOT _ _\n"
  301. ... "3 a _ DT _ _ 4 SPEC _ _\n"
  302. ... "4 dog _ NN _ _ 2 OBJ _ _\n"
  303. ... "5 . _ . _ _ 2 PUNCT _ _\n")
  304. >>>
  305. >>>
  306. >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
  307. ... "2 walks _ VB _ _ 0 ROOT _ _\n"
  308. ... "3 . _ . _ _ 2 PUNCT _ _\n")
  309. >>> dg1 = DependencyGraph(_dg1_str)
  310. >>> dg2 = DependencyGraph(_dg2_str)
  311. >>> # Initialize a MaltParser object
  312. >>> parser_dirname = 'maltparser-1.7.2'
  313. >>> mp = MaltParser(parser_dirname=parser_dirname)
  314. >>>
  315. >>> # Trains a model.
  316. >>> mp.train([dg1,dg2], verbose=False)
  317. >>> sent1 = ['John','sees','Mary', '.']
  318. >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
  319. >>>
  320. >>> # Parse a single sentence.
  321. >>> parsed_sent1 = mp.parse_one(sent1)
  322. >>> parsed_sent2 = mp.parse_one(sent2)
  323. >>> print (parsed_sent1.tree())
  324. (sees John Mary .)
  325. >>> print (parsed_sent2.tree())
  326. (walks John (dog a) .)
  327. >>>
  328. >>> # Parsing multiple sentences.
  329. >>> sentences = [sent1,sent2]
  330. >>> parsed_sents = mp.parse_sents(sentences)
  331. >>> print(next(next(parsed_sents)).tree())
  332. (sees John Mary .)
  333. >>> print(next(next(parsed_sents)).tree())
  334. (walks John (dog a) .)
  335. >>>
  336. >>> # Initialize a MaltParser object with an English pre-trained model.
  337. >>> parser_dirname = 'maltparser-1.7.2'
  338. >>> model_name = 'engmalt.linear-1.7.mco'
  339. >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
  340. >>> sent1 = 'I shot an elephant in my pajamas .'.split()
  341. >>> sent2 = 'Time flies like banana .'.split()
  342. >>> # Parse a single sentence.
  343. >>> print(mp.parse_one(sent1).tree())
  344. (shot I (elephant an) (in (pajamas my)) .)
  345. # Parsing multiple sentences
  346. >>> sentences = [sent1,sent2]
  347. >>> parsed_sents = mp.parse_sents(sentences)
  348. >>> print(next(next(parsed_sents)).tree())
  349. (shot I (elephant an) (in (pajamas my)) .)
  350. >>> print(next(next(parsed_sents)).tree())
  351. (flies Time (like banana) .)
  352. '''
  353. import doctest
  354. doctest.testmod()