# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to MaltParser
#
# Author: Dan Garrette <dhgarrette@gmail.com>
# Contributor: Liling Tan, Mustufain, osamamukhtar11
#
# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- from __future__ import print_function, unicode_literals
- import os
- import sys
- import tempfile
- import subprocess
- import inspect
- from six import text_type
- from nltk.data import ZipFilePathPointer
- from nltk.internals import find_dir, find_file, find_jars_within_path
- from nltk.parse.api import ParserI
- from nltk.parse.dependencygraph import DependencyGraph
- from nltk.parse.util import taggedsents_to_conll
def malt_regex_tagger():
    """
    Build the default regular-expression POS tagger used before parsing.

    The rules are grouped by category below and concatenated in a fixed
    order before being handed to ``RegexpTagger``; the final catch-all rule
    tags anything not matched earlier as ``NN``.

    :return: the bound ``tag`` method of the constructed ``RegexpTagger``
    """
    from nltk.tag import RegexpTagger

    punctuation_rules = [
        (r'\.$', '.'),
        (r'\,$', ','),
        (r'\?$', '?'),  # fullstop, comma, Qmark
        (r'\($', '('),
        (r'\)$', ')'),  # round brackets
        (r'\[$', '['),
        (r'\]$', ']'),  # square brackets
    ]
    closed_class_rules = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'DT'),  # articles
        (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'),  # pronouns
        (r'(His|his|Her|her|Its|its)$', 'PRP$'),  # possessive
        (r'(my|Your|your|Yours|yours)$', 'PRP$'),  # possessive
    ]
    time_preposition_rules = [
        (r'(on|On|in|In|at|At|since|Since)$', 'IN'),
        (r'(for|For|ago|Ago|before|Before)$', 'IN'),
        (r'(till|Till|until|Until)$', 'IN'),
    ]
    space_preposition_rules = [
        (r'(by|By|beside|Beside)$', 'IN'),
        (r'(under|Under|below|Below)$', 'IN'),
        (r'(over|Over|above|Above)$', 'IN'),
        (r'(across|Across|through|Through)$', 'IN'),
        (r'(into|Into|towards|Towards)$', 'IN'),
        (r'(onto|Onto|from|From)$', 'IN'),
    ]
    suffix_rules = [
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN'),  # nouns (default)
    ]

    # Concatenation keeps the rules in exactly their original sequence.
    all_rules = (
        punctuation_rules
        + closed_class_rules
        + time_preposition_rules
        + space_preposition_rules
        + suffix_rules
    )
    return RegexpTagger(all_rules).tag
def find_maltparser(parser_dirname):
    """
    Locate the MaltParser .jar file and the .jar files it depends on.

    :param parser_dirname: an existing path to the MaltParser directory, or
        a directory name to be resolved by ``find_dir`` with the
        ``MALT_PARSER`` environment variable
    :type parser_dirname: str
    :return: paths of all .jar files found under the MaltParser directory
    :rtype: list(str)
    :raise AssertionError: if ``log4j.jar``, ``libsvm.jar`` or
        ``liblinear-1.8.jar`` is missing, or if no ``maltparser-*.jar`` is
        present
    """
    if os.path.exists(parser_dirname):  # If a full path is given.
        _malt_dir = parser_dirname
    else:  # Try to find path to maltparser directory in environment variables.
        _malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',))

    _malt_jars = set(find_jars_within_path(_malt_dir))
    _jars = set(os.path.basename(jar) for jar in _malt_jars)

    # Check that the found directory contains all the necessary .jar files.
    # The checks raise explicitly (rather than using ``assert``) so that they
    # still run under ``python -O``; AssertionError is kept so that existing
    # callers catching it continue to work.
    malt_dependencies = {'log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar'}
    missing = malt_dependencies - _jars
    if missing:
        raise AssertionError(
            "MaltParser directory %r is missing required jar(s): %s"
            % (_malt_dir, ', '.join(sorted(missing)))
        )
    if not any(
        jar.startswith('maltparser-') and jar.endswith('.jar') for jar in _jars
    ):
        raise AssertionError(
            "No maltparser-*.jar found in MaltParser directory %r" % (_malt_dir,)
        )
    return list(_malt_jars)
def find_malt_model(model_filename):
    """
    Resolve the MaltParser model file to use.

    :param model_filename: ``None``, an existing path, or a file name to be
        looked up by ``find_file`` with the ``MALT_MODEL`` environment
        variable
    :type model_filename: str or None
    :return: ``'malt_temp.mco'`` when no model name is given, the argument
        itself when it is an existing path, otherwise whatever ``find_file``
        returns
    """
    # No model supplied: fall back to the placeholder name.
    if model_filename is None:
        return 'malt_temp.mco'

    # A path that already exists is used as given.
    if os.path.exists(model_filename):
        return model_filename

    # Otherwise defer to the environment-variable based lookup.
    return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False)
class MaltParser(ParserI):
    """
    A class for dependency parsing with MaltParser. The input is the paths to:
    - a maltparser directory
    - (optionally) the path to a pre-trained MaltParser .mco model file
    - (optionally) the tagger to use for POS tagging before parsing
    - (optionally) additional Java arguments

    Example:
        >>> from nltk.parse import malt
        >>> # With MALT_PARSER and MALT_MODEL environment set.
        >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
        >>> # Without MALT_PARSER and MALT_MODEL environment.
        >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP
        >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP
        (shot I (elephant an) (in (pajamas my)) .)
    """

    def __init__(
        self,
        parser_dirname,
        model_filename=None,
        tagger=None,
        additional_java_args=None,
    ):
        """
        An interface for parsing with the Malt Parser.

        :param parser_dirname: The path to the maltparser directory that
            contains the maltparser-1.x.jar
        :type parser_dirname: str
        :param model_filename: The name of the pre-trained model with .mco file
            extension. If provided, training will not be required.
            (see http://www.maltparser.org/mco/mco.html and
            see http://www.patful.com/chalk/node/185)
        :type model_filename: str
        :param tagger: The tagger used to POS tag the raw string before
            formatting to CONLL format. It should behave like `nltk.pos_tag`
        :type tagger: function
        :param additional_java_args: This is the additional Java arguments that
            one can use when calling Maltparser, usually this is the heapsize
            limits, e.g. `additional_java_args=['-Xmx1024m']`
            (see http://goo.gl/mpDBvQ)
        :type additional_java_args: list
        """
        # Find all the necessary jar files for MaltParser.
        self.malt_jars = find_maltparser(parser_dirname)
        # Initialize additional java arguments.
        self.additional_java_args = (
            additional_java_args if additional_java_args is not None else []
        )
        # Initialize model.
        self.model = find_malt_model(model_filename)
        # The placeholder name means no pre-trained model was supplied, so
        # train() / train_from_file() must be called before parsing.
        self._trained = self.model != 'malt_temp.mco'
        # Set the working_dir parameters i.e. `-w` from MaltParser's option.
        self.working_dir = tempfile.gettempdir()
        # Initialize POS tagger.
        self.tagger = tagger if tagger is not None else malt_regex_tagger()

    def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'):
        """
        Use MaltParser to parse multiple POS tagged sentences. Takes multiple
        sentences where each sentence is a list of (word, tag) tuples.
        The sentences must have already been tokenized and tagged.

        This is a generator: nothing (including the "not trained" check) runs
        until the first item is requested.

        :param sentences: Input sentences to parse
        :type sentences: list(list(tuple(str, str)))
        :param verbose: if True, MaltParser's stdout/stderr are not captured
        :type verbose: bool
        :param top_relation_label: passed through to ``DependencyGraph``
        :type top_relation_label: str
        :return: iter(iter(``DependencyGraph``)) the dependency graph
            representation of each sentence
        :raise Exception: if the parser has not been trained, or if
            MaltParser exits with a non-zero code
        """
        if not self._trained:
            raise Exception("Parser has not been trained. Call train() first.")

        # Paths of the temporary files created so far; they are removed in
        # the ``finally`` below even when writing or parsing fails.
        temp_paths = []
        try:
            with tempfile.NamedTemporaryFile(
                prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False
            ) as input_file:
                temp_paths.append(input_file.name)
                # Convert list of sentences to CONLL format.
                for line in taggedsents_to_conll(sentences):
                    input_file.write(text_type(line))

            # Only the name of the output file is needed; MaltParser writes
            # its content, so the handle is closed straight away.
            with tempfile.NamedTemporaryFile(
                prefix='malt_output.conll.',
                dir=self.working_dir,
                mode='w',
                delete=False,
            ) as output_file:
                temp_paths.append(output_file.name)

            # Generate command to run maltparser.
            cmd = self.generate_malt_command(
                input_file.name, output_file.name, mode="parse"
            )

            # This is a maltparser quirk, it needs to be run
            # where the model file is. otherwise it goes into an awkward
            # missing .jars or strange -w working_dir problem.
            _current_path = os.getcwd()  # Remembers the current path.
            try:  # Change to modelfile path
                os.chdir(os.path.split(self.model)[0])
            except OSError:
                # Deliberately best-effort: the model path may have no
                # directory component, in which case we stay where we are.
                pass
            try:
                ret = self._execute(cmd, verbose)  # Run command.
            finally:
                # Always restore the caller's working directory.
                os.chdir(_current_path)

            if ret != 0:
                raise Exception(
                    "MaltParser parsing (%s) failed with exit "
                    "code %d" % (' '.join(cmd), ret)
                )

            # Read the whole output before yielding so the temporary files
            # can be removed even if the caller never exhausts the generator.
            with open(output_file.name) as infile:
                parsed_output = infile.read()
        finally:
            for path in temp_paths:
                os.remove(path)

        # Must return iter(iter(Tree))
        for tree_str in parsed_output.split('\n\n'):
            yield iter(
                [DependencyGraph(tree_str, top_relation_label=top_relation_label)]
            )

    def parse_sents(self, sentences, verbose=False, top_relation_label='null'):
        """
        Use MaltParser to parse multiple sentences.
        Takes a list of sentences, where each sentence is a list of words.
        Each sentence will be automatically tagged with this
        MaltParser instance's tagger.

        :param sentences: Input sentences to parse
        :type sentences: list(list(str))
        :param verbose: forwarded to ``parse_tagged_sents``
        :type verbose: bool
        :param top_relation_label: forwarded to ``parse_tagged_sents``
        :type top_relation_label: str
        :return: iter(iter(``DependencyGraph``)), as produced by
            ``parse_tagged_sents``
        """
        tagged_sentences = (self.tagger(sentence) for sentence in sentences)
        return self.parse_tagged_sents(
            tagged_sentences, verbose, top_relation_label=top_relation_label
        )

    def generate_malt_command(self, inputfilename, outputfilename=None, mode=None):
        """
        This function generates the maltparser command use at the terminal.

        :param inputfilename: path to the input file
        :type inputfilename: str
        :param outputfilename: path to the output file; only added to the
            command when ``mode`` is ``'parse'``
        :type outputfilename: str
        :param mode: value passed to MaltParser's ``-m`` option; this class
            uses ``'parse'`` and ``'learn'``
        :type mode: str
        :return: the command as a list of arguments
        :rtype: list(str)
        """
        cmd = ['java']
        cmd += self.additional_java_args  # Adds additional java arguments
        # Joins classpaths with ";" if on Windows and on Linux/Mac use ":"
        classpaths_separator = ';' if sys.platform.startswith('win') else ':'
        cmd += [
            '-cp',
            classpaths_separator.join(self.malt_jars),
        ]  # Adds classpaths for jars
        cmd += ['org.maltparser.Malt']  # Adds the main function.

        # Adds the model file.
        if os.path.exists(self.model):  # when parsing
            # Only the file name is passed; parse_tagged_sents changes into
            # the model's directory before running the command.
            cmd += ['-c', os.path.split(self.model)[-1]]
        else:  # when learning
            cmd += ['-c', self.model]

        cmd += ['-i', inputfilename]
        if mode == 'parse':
            cmd += ['-o', outputfilename]
        cmd += ['-m', mode]  # mode use to generate parses.
        return cmd

    @staticmethod
    def _execute(cmd, verbose=False):
        """
        Run ``cmd`` as a subprocess and return its exit code.

        :param cmd: the command and its arguments
        :type cmd: list(str)
        :param verbose: if True the child inherits stdout/stderr; otherwise
            both are captured and discarded
        :type verbose: bool
        :return: the exit code of the process
        :rtype: int
        """
        output = None if verbose else subprocess.PIPE
        p = subprocess.Popen(cmd, stdout=output, stderr=output)
        # communicate() drains the pipes while waiting; a plain wait() can
        # deadlock once the child fills the OS pipe buffer.
        p.communicate()
        return p.returncode

    def train(self, depgraphs, verbose=False):
        """
        Train MaltParser from a list of ``DependencyGraph`` objects

        :param depgraphs: list of ``DependencyGraph`` objects for training input data
        :type depgraphs: DependencyGraph
        :param verbose: forwarded to ``train_from_file``
        :type verbose: bool
        :raise Exception: if MaltParser training fails (see ``train_from_file``)
        """
        # Write the conll_str to a malt_train.conll.* file in the working dir.
        input_file = tempfile.NamedTemporaryFile(
            prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
        )
        try:
            with input_file:
                input_str = '\n'.join(dg.to_conll(10) for dg in depgraphs)
                input_file.write(text_type(input_str))
            # Trains the model with the malt_train.conll
            self.train_from_file(input_file.name, verbose=verbose)
        finally:
            # Removes the malt_train.conll whether or not training succeeded.
            os.remove(input_file.name)

    def train_from_file(self, conll_file, verbose=False):
        """
        Train MaltParser from a file

        :param conll_file: str for the filename of the training input data,
            or a ``ZipFilePathPointer`` whose content is first copied to a
            temporary file
        :type conll_file: str
        :param verbose: if True, MaltParser's stdout/stderr are not captured
        :type verbose: bool
        :raise Exception: if MaltParser exits with a non-zero code
        """
        # If conll_file is a ZipFilePathPointer,
        # then we need to do some extra massaging
        if isinstance(conll_file, ZipFilePathPointer):
            input_file = tempfile.NamedTemporaryFile(
                prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False
            )
            try:
                with input_file:
                    with conll_file.open() as conll_input_file:
                        conll_str = conll_input_file.read()
                        # NOTE(review): if open() yields bytes, text_type()
                        # would write their repr on Python 3 -- confirm
                        # against ZipFilePathPointer.open().
                        input_file.write(text_type(conll_str))
                # The temporary file is closed (and therefore flushed) before
                # MaltParser reads it.
                return self.train_from_file(input_file.name, verbose=verbose)
            finally:
                os.remove(input_file.name)

        # Generate command to run maltparser.
        cmd = self.generate_malt_command(conll_file, mode="learn")
        ret = self._execute(cmd, verbose)
        if ret != 0:
            raise Exception(
                "MaltParser training (%s) failed with exit "
                "code %d" % (' '.join(cmd), ret)
            )
        self._trained = True
# Script entry point: runs this module's doctests when executed directly.
if __name__ == '__main__':
    # NOTE(review): the literal below is a bare string expression inside the
    # ``if`` body rather than a docstring, so doctest.testmod() presumably
    # does not collect the examples in it -- confirm before relying on this
    # demo being exercised.
    '''
    A demonstration function to show how NLTK users can use the malt parser API.
    >>> from nltk import pos_tag
    >>> assert 'MALT_PARSER' in os.environ, str(
    ... "Please set MALT_PARSER in your global environment, e.g.:\n"
    ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'")
    >>>
    >>> assert 'MALT_MODEL' in os.environ, str(
    ... "Please set MALT_MODEL in your global environment, e.g.:\n"
    ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'")
    >>>
    >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 sees _ VB _ _ 0 ROOT _ _\n"
    ... "3 a _ DT _ _ 4 SPEC _ _\n"
    ... "4 dog _ NN _ _ 2 OBJ _ _\n"
    ... "5 . _ . _ _ 2 PUNCT _ _\n")
    >>>
    >>>
    >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n"
    ... "2 walks _ VB _ _ 0 ROOT _ _\n"
    ... "3 . _ . _ _ 2 PUNCT _ _\n")
    >>> dg1 = DependencyGraph(_dg1_str)
    >>> dg2 = DependencyGraph(_dg2_str)
    >>> # Initialize a MaltParser object
    >>> parser_dirname = 'maltparser-1.7.2'
    >>> mp = MaltParser(parser_dirname=parser_dirname)
    >>>
    >>> # Trains a model.
    >>> mp.train([dg1,dg2], verbose=False)
    >>> sent1 = ['John','sees','Mary', '.']
    >>> sent2 = ['John', 'walks', 'a', 'dog', '.']
    >>>
    >>> # Parse a single sentence.
    >>> parsed_sent1 = mp.parse_one(sent1)
    >>> parsed_sent2 = mp.parse_one(sent2)
    >>> print (parsed_sent1.tree())
    (sees John Mary .)
    >>> print (parsed_sent2.tree())
    (walks John (dog a) .)
    >>>
    >>> # Parsing multiple sentences.
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (sees John Mary .)
    >>> print(next(next(parsed_sents)).tree())
    (walks John (dog a) .)
    >>>
    >>> # Initialize a MaltParser object with an English pre-trained model.
    >>> parser_dirname = 'maltparser-1.7.2'
    >>> model_name = 'engmalt.linear-1.7.mco'
    >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag)
    >>> sent1 = 'I shot an elephant in my pajamas .'.split()
    >>> sent2 = 'Time flies like banana .'.split()
    >>> # Parse a single sentence.
    >>> print(mp.parse_one(sent1).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    # Parsing multiple sentences
    >>> sentences = [sent1,sent2]
    >>> parsed_sents = mp.parse_sents(sentences)
    >>> print(next(next(parsed_sents)).tree())
    (shot I (elephant an) (in (pajamas my)) .)
    >>> print(next(next(parsed_sents)).tree())
    (flies Time (like banana) .)
    '''
    import doctest

    doctest.testmod()
|