123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190 |
- # Natural Language Toolkit: Interface to Megam Classifier
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- A set of functions used to interface with the external megam_ maxent
- optimization package. Before megam can be used, you should tell NLTK where it
- can find the megam binary, using the ``config_megam()`` function. Typical
- usage:
- >>> from nltk.classify import megam
- >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
- [Found megam: ...]
- Use with MaxentClassifier. Example below, see MaxentClassifier documentation
- for details.
- nltk.classify.MaxentClassifier.train(corpus, 'megam')
- .. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html
- """
- from __future__ import print_function
- import subprocess
- from six import string_types
- from nltk import compat
- from nltk.internals import find_binary
- try:
- import numpy
- except ImportError:
- numpy = None
- ######################################################################
- # { Configuration
- ######################################################################
- _megam_bin = None
- def config_megam(bin=None):
- """
- Configure NLTK's interface to the ``megam`` maxent optimization
- package.
- :param bin: The full path to the ``megam`` binary. If not specified,
- then nltk will search the system for a ``megam`` binary; and if
- one is not found, it will raise a ``LookupError`` exception.
- :type bin: str
- """
- global _megam_bin
- _megam_bin = find_binary(
- 'megam',
- bin,
- env_vars=['MEGAM'],
- binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'],
- url='http://www.umiacs.umd.edu/~hal/megam/index.html',
- )
- ######################################################################
- # { Megam Interface Functions
- ######################################################################
- def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
- """
- Generate an input file for ``megam`` based on the given corpus of
- classified tokens.
- :type train_toks: list(tuple(dict, str))
- :param train_toks: Training data, represented as a list of
- pairs, the first member of which is a feature dictionary,
- and the second of which is a classification label.
- :type encoding: MaxentFeatureEncodingI
- :param encoding: A feature encoding, used to convert featuresets
- into feature vectors. May optionally implement a cost() method
- in order to assign different costs to different class predictions.
- :type stream: stream
- :param stream: The stream to which the megam input file should be
- written.
- :param bernoulli: If true, then use the 'bernoulli' format. I.e.,
- all joint features have binary values, and are listed iff they
- are true. Otherwise, list feature values explicitly. If
- ``bernoulli=False``, then you must call ``megam`` with the
- ``-fvals`` option.
- :param explicit: If true, then use the 'explicit' format. I.e.,
- list the features that would fire for any of the possible
- labels, for each token. If ``explicit=True``, then you must
- call ``megam`` with the ``-explicit`` option.
- """
- # Look up the set of labels.
- labels = encoding.labels()
- labelnum = dict((label, i) for (i, label) in enumerate(labels))
- # Write the file, which contains one line per instance.
- for featureset, label in train_toks:
- # First, the instance number (or, in the weighted multiclass case, the cost of each label).
- if hasattr(encoding, 'cost'):
- stream.write(
- ':'.join(str(encoding.cost(featureset, label, l)) for l in labels)
- )
- else:
- stream.write('%d' % labelnum[label])
- # For implicit file formats, just list the features that fire
- # for this instance's actual label.
- if not explicit:
- _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
- # For explicit formats, list the features that would fire for
- # any of the possible labels.
- else:
- for l in labels:
- stream.write(' #')
- _write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
- # End of the instance.
- stream.write('\n')
- def parse_megam_weights(s, features_count, explicit=True):
- """
- Given the stdout output generated by ``megam`` when training a
- model, return a ``numpy`` array containing the corresponding weight
- vector. This function does not currently handle bias features.
- """
- if numpy is None:
- raise ValueError('This function requires that numpy be installed')
- assert explicit, 'non-explicit not supported yet'
- lines = s.strip().split('\n')
- weights = numpy.zeros(features_count, 'd')
- for line in lines:
- if line.strip():
- fid, weight = line.split()
- weights[int(fid)] = float(weight)
- return weights
- def _write_megam_features(vector, stream, bernoulli):
- if not vector:
- raise ValueError(
- 'MEGAM classifier requires the use of an ' 'always-on feature.'
- )
- for (fid, fval) in vector:
- if bernoulli:
- if fval == 1:
- stream.write(' %s' % fid)
- elif fval != 0:
- raise ValueError(
- 'If bernoulli=True, then all' 'features must be binary.'
- )
- else:
- stream.write(' %s %s' % (fid, fval))
- def call_megam(args):
- """
- Call the ``megam`` binary with the given arguments.
- """
- if isinstance(args, string_types):
- raise TypeError('args should be a list of strings')
- if _megam_bin is None:
- config_megam()
- # Call megam via a subprocess
- cmd = [_megam_bin] + args
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
- (stdout, stderr) = p.communicate()
- # Check the return code.
- if p.returncode != 0:
- print()
- print(stderr)
- raise OSError('megam command failed!')
- if isinstance(stdout, string_types):
- return stdout
- else:
- return stdout.decode('utf-8')
|