# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An "input-feature" is a property of an unlabeled token.
  - A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

For all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""
from __future__ import print_function, unicode_literals

try:
    import numpy
except ImportError:
    pass

import tempfile
import os
from collections import defaultdict

from six import integer_types

from nltk import compat
from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
from nltk.probability import DictionaryProbDist

from nltk.classify.api import ClassifierI
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights

__docformat__ = 'epytext en'
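

# Illustrative sketch (not part of the original NLTK API): the mapping from an
# input-feature to a binary joint-feature described in the module docstring
# above, written out as a plain function.  The names `feat_val` and
# `some_label`, and the 'prevword' featureset key, are hypothetical examples.
def _prevword_joint_feature_example(token, label, feat_val='bank', some_label='FINANCE'):
    """Return 1 if the token's 'prevword' input-feature equals ``feat_val``
    and the label equals ``some_label``; return 0 otherwise."""
    if token.get('prevword') == feat_val and label == some_label:
        return 1
    return 0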
######################################################################
# { Classifier Model
######################################################################


@compat.python_2_unicode_compatible
class MaxentClassifier(ClassifierI):
    """
    A maximum entropy classifier (also known as a "conditional
    exponential classifier").  This classifier is parameterized by a
    set of "weights", which are used to combine the joint-features
    that are generated from a featureset by an "encoding".  In
    particular, the encoding maps each ``(featureset, label)`` pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                                dotprod(weights, encode(fs,label))
      prob(label|fs) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    """

    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights: The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        # self._logarithmic = False
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.

        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        """
        self._weights = new_weights
        assert self._encoding.length() == len(new_weights)

    def weights(self):
        """
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)
            if self._logarithmic:
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total
            else:
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        """
        descr_width = 50
        TEMPLATE = '  %-' + str(descr_width - 2) + 's%s%8.3f'

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print(
            '  Feature'.ljust(descr_width)
            + ''.join('%8s' % (("%s" % l)[:7]) for l in labels)
        )
        print('  ' + '-' * (descr_width - 2 + 8 * len(labels)))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(
                key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
            )
            for (f_id, f_val) in feature_vector:
                if self._logarithmic:
                    score = self._weights[f_id] * f_val
                else:
                    score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(' and label is ')[0]  # hack
                descr += ' (%s)' % f_val  # hack
                if len(descr) > 47:
                    descr = descr[:44] + '...'
                print(TEMPLATE % (descr, i * 8 * ' ', score))
                sums[label] += score
        print('  ' + '-' * (descr_width - 1 + 8 * len(labels)))
        print(
            '  TOTAL:'.ljust(descr_width) + ''.join('%8.3f' % sums[l] for l in labels)
        )
        print(
            '  PROBS:'.ljust(descr_width)
            + ''.join('%8.3f' % pdist.prob(l) for l in labels)
        )

    def most_informative_features(self, n=10):
        """
        Generates the ranked list of informative features from most to least.
        """
        if hasattr(self, '_most_informative_features'):
            return self._most_informative_features[:n]
        else:
            self._most_informative_features = sorted(
                list(range(len(self._weights))),
                key=lambda fid: abs(self._weights[fid]),
                reverse=True,
            )
            return self._most_informative_features[:n]

    def show_most_informative_features(self, n=10, show='all'):
        """
        :param show: all, neg, or pos (for negative-only or positive-only)
        :type show: str
        :param n: The number of top features to show.
        :type n: int
        """
        # Use None to get the full list of ranked features.
        fids = self.most_informative_features(None)
        if show == 'pos':
            fids = [fid for fid in fids if self._weights[fid] > 0]
        elif show == 'neg':
            fids = [fid for fid in fids if self._weights[fid] < 0]
        for fid in fids[:n]:
            print('%8.3f %s' % (self._weights[fid], self._encoding.describe(fid)))

    def __repr__(self):
        return '<ConditionalExponentialClassifier: %d labels, %d features>' % (
            len(self._encoding.labels()),
            self._encoding.length(),
        )

    #: A list of the algorithm names that are accepted for the
    #: ``train()`` method's ``algorithm`` parameter.
    ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']

    @classmethod
    def train(
        cls,
        train_toks,
        algorithm=None,
        trace=3,
        encoding=None,
        labels=None,
        gaussian_prior_sigma=0,
        **cutoffs
    ):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
              Improved Iterative Scaling (``'IIS'``)
            - External Libraries (requiring megam):
              LM-BFGS algorithm, with training performed by Megam (``'megam'``)

            The default algorithm is ``'IIS'``.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.
        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.
        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.
        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            ``megam``.  For other algorithms, its value is ignored.
        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - ``max_iter=v``: Terminate after ``v`` iterations.
            - ``min_ll=v``: Terminate after the negative average
              log-likelihood drops under ``v``.
            - ``min_lldelta=v``: Terminate if a single iteration improves
              log likelihood by less than ``v``.
        """
        if algorithm is None:
            algorithm = 'iis'
        for key in cutoffs:
            if key not in (
                'max_iter',
                'min_ll',
                'min_lldelta',
                'max_acc',
                'min_accdelta',
                'count_cutoff',
                'norm',
                'explicit',
                'bernoulli',
            ):
                raise TypeError('Unexpected keyword arg %r' % key)
        algorithm = algorithm.lower()
        if algorithm == 'iis':
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == 'gis':
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == 'megam':
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
            )
        elif algorithm == 'tadm':
            kwargs = cutoffs
            kwargs['trace'] = trace
            kwargs['encoding'] = encoding
            kwargs['labels'] = labels
            kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma
            return TadmMaxentClassifier.train(train_toks, **kwargs)
        else:
            raise ValueError('Unknown algorithm %s' % algorithm)


#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier
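

# A minimal usage sketch (not executed on import, and not part of the original
# module): it assumes a tiny toy corpus and shows how ``MaxentClassifier.train``,
# ``prob_classify`` and ``show_most_informative_features`` fit together.  The
# featureset keys and labels below are illustrative only.
def _maxent_usage_sketch():
    train_toks = [
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'k'}, 'male'),
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'o'}, 'male'),
    ]
    classifier = MaxentClassifier.train(train_toks, algorithm='iis', trace=0, max_iter=10)
    pdist = classifier.prob_classify({'last_letter': 'a'})
    print(classifier.classify({'last_letter': 'a'}), pdist.prob('female'))
    classifier.show_most_informative_features(n=5)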
######################################################################
# { Feature Encodings
######################################################################


class MaxentFeatureEncodingI(object):
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """

    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        """
        raise NotImplementedError()

    def length(self):
        """
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        """
        raise NotImplementedError()

    def labels(self):
        """
        :return: A list of the \"known labels\" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        """
        raise NotImplementedError()

    def describe(self, fid):
        """
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        """
        raise NotImplementedError()

    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise NotImplementedError()


class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """

    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
            and a label, and returns the sparse joint feature vector
            that encodes them::

                func(featureset, label) -> feature_vector

            This sparse joint feature vector (``feature_vector``) is a
            list of ``(index, value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the \"known labels\" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        return 'no description available'
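

# Illustrative sketch (hypothetical names, not part of the original module):
# wrapping the tiny joint-feature function defined near the top of this file in
# a ``FunctionBackedMaxentFeatureEncoding``.  The encoding has exactly one
# joint-feature, so its length is 1.
def _function_backed_encoding_sketch():
    def func(featureset, label):
        value = _prevword_joint_feature_example(featureset, label)
        return [(0, value)] if value else []

    encoding = FunctionBackedMaxentFeatureEncoding(func, 1, ['FINANCE', 'RIVER'])
    return encoding.encode({'prevword': 'bank'}, 'FINANCE')  # -> [(0, 1)]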
class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                'Mapping values must be exactly the '
                'set of integers from 0...len(mapping)'
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict(
                (label, i + self._length) for (i, label) in enumerate(labels)
            )
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict(
                (fname, i + self._length) for (i, fname) in enumerate(fnames)
            )
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():

            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break  # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, integer_types):
            raise TypeError('describe() expected an int')
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return '%s==%r and label is %r' % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return 'label is %r' % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return '%s is unseen' % fname
        else:
            raise ValueError('Bad feature id')

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature occurs with value 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError('Unexpected label %s' % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
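

# A small sketch (toy data, not part of the original module) of how
# ``BinaryMaxentFeatureEncoding.train`` builds its (fname, fval, label) -> fid
# mapping and how ``encode`` turns a featureset into sparse (index, value) pairs.
def _binary_encoding_sketch():
    train_toks = [
        ({'outlook': 'sunny'}, 'yes'),
        ({'outlook': 'rainy'}, 'no'),
    ]
    encoding = BinaryMaxentFeatureEncoding.train(train_toks)
    print(encoding.length())                             # 2: one fid per attested triple
    print(encoding.encode({'outlook': 'sunny'}, 'yes'))  # [(fid, 1)] for the matching triple
    print(encoding.describe(0))                          # e.g. "outlook=='sunny' and label is 'yes'"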
class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            C = len(set(fname for (fname, fval, label) in mapping)) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors
        will sum to."""
        return self._C

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature.
        total = sum(v for (f, v) in encoding)
        if total >= self._C:
            raise ValueError('Correction feature is not high enough!')
        encoding.append((base_length, self._C - total))

        # Return the result
        return encoding

    def length(self):
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return 'Correction feature (%s)' % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)
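

# Sketch (toy data, illustrative only): every vector produced by a GISEncoding
# sums to the same constant C, because the appended correction feature makes up
# the difference between the sum of the binary features and C.
def _gis_encoding_sketch():
    train_toks = [({'a': 1, 'b': 2}, 'x'), ({'a': 1}, 'y')]
    encoding = GISEncoding.train(train_toks)
    for tok, label in train_toks:
        vector = encoding.encode(tok, label)
        assert sum(v for (f, v) in vector) == encoding.C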
class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        self._mapping = OrderedDict(mapping)
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        return self._labels

    def describe(self, fid):
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        mapping = OrderedDict()
        if not labels:
            labels = []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        for (featureset, label) in train_toks:
            if label not in labels:
                labels.append(label)

        for (featureset, label) in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)


class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if (fs[fname] == type(fval))
    |                      {      and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """

    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                'Mapping values must be exactly the '
                'set of integers from 0...len(mapping)'
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict(
                (label, i + self._length) for (i, label) in enumerate(labels)
            )
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict(
                (fname, i + self._length) for (i, fname) in enumerate(fnames)
            )
            self._length += len(fnames)

    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            if isinstance(fval, (integer_types, float)):
                # Known feature name & value:
                if (fname, type(fval), label) in self._mapping:
                    encoding.append((self._mapping[fname, type(fval), label], fval))
            else:
                # Known feature name & value:
                if (fname, fval, label) in self._mapping:
                    encoding.append((self._mapping[fname, fval, label], 1))

                # Otherwise, we might want to fire an "unseen-value feature".
                elif self._unseen:
                    # Have we seen this fname/fval combination with any label?
                    for label2 in self._labels:
                        if (fname, fval, label2) in self._mapping:
                            break  # we've seen this fname/fval combo
                    # We haven't -- fire the unseen-value feature
                    else:
                        if fname in self._unseen:
                            encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding

    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, integer_types):
            raise TypeError('describe() expected an int')
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return '%s==%r and label is %r' % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return 'label is %r' % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return '%s is unseen' % fname
        else:
            raise ValueError('Bad feature id')

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: recognized feature value types are (int, float); other
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature occurs with value 1
            fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError('Unexpected label %s' % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():
                if type(fval) in (int, float):
                    fval = type(fval)

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
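

# Sketch (toy data, illustrative only): with TypedMaxentFeatureEncoding,
# int/float input-features are encoded by their numeric value under a single
# (fname, type, label) feature, while string features get one binary feature
# per (fname, fval, label) combination.
def _typed_encoding_sketch():
    train_toks = [
        ({'length': 4.0, 'shape': 'round'}, 'fruit'),
        ({'length': 12.0, 'shape': 'long'}, 'vegetable'),
    ]
    encoding = TypedMaxentFeatureEncoding.train(train_toks)
    # The float feature contributes its value; the string feature contributes 1.
    print(encoding.encode({'length': 7.5, 'shape': 'round'}, 'fruit'))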
######################################################################
# { Classifier Trainer: Generalized Iterative Scaling
######################################################################


def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, 'C'):
        raise TypeError(
            'The GIS algorithm requires an encoding that '
            'defines C (e.g., GISEncoding).'
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), 'd')
    for fid in unattested:
        weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
    if trace > 2:
        print()
        print('      Iteration    Log Likelihood    Accuracy')
        print('      ---------------------------------------')

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print('      Training stopped: keyboard interrupt')
    except:
        raise

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print('         Final    %14.5f    %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier
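

# A worked restatement (illustrative only, not part of the original module) of
# the GIS weight update performed above: each weight moves by
# Cinv * (log2(empirical count) - log2(estimated count)), so a feature that the
# model currently under-predicts has its weight increased, and one it
# over-predicts has its weight decreased.
def _gis_update_sketch(weight, empirical_count, estimated_count, C):
    return weight + (numpy.log2(empirical_count) - numpy.log2(estimated_count)) / C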
def calculate_empirical_fcount(train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    for tok, label in train_toks:
        for (index, val) in encoding.encode(tok, label):
            fcount[index] += val

    return fcount


def calculate_estimated_fcount(classifier, train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    for tok, label in train_toks:
        pdist = classifier.prob_classify(tok)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for (fid, fval) in encoding.encode(tok, label):
                fcount[fid] += prob * fval

    return fcount


######################################################################
# { Classifier Trainer: Improved Iterative Scaling
######################################################################


def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how many times each feature occurs in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # Find the nf map, and related variables nfarray and nfident.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nfident is
    # nfarray multiplied by an identity matrix.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), 'd')
    for fid in unattested:
        weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
    if trace > 2:
        print()
        print('      Iteration    Log Likelihood    Accuracy')
        print('      ---------------------------------------')

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print('      Training stopped: keyboard interrupt')
    except:
        raise

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print('         Final    %14.5f    %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier


def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector*.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of *nf(t)*
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range *0...N*.  For example,
    if the only values of *nf()* that were attested were 3, 5, and
    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    nfset = set()
    for tok, _ in train_toks:
        for label in encoding.labels():
            nfset.add(sum(val for (id, val) in encoding.encode(tok, label)))
    return dict((nf, i) for (i, nf) in enumerate(nfset))
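

# Sketch (toy numbers, illustrative only): what the nf compression above buys
# us.  If the attested nf values are sparse, e.g. {3, 5, 7}, the IIS matrices
# only need len(nfmap) rows instead of max(nf) + 1.
def _nfmap_sketch():
    nfmap = {3: 0, 5: 1, 7: 2}  # as in the docstring example
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    print(nfarray)              # [3. 5. 7.]: dense index -> nf value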
def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    """
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
      - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
      - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
      - *nf(vector)* = ``sum([val for (id,val) in vector])``

    This method uses Newton's method to solve this equation for
    *delta[i]*.  In particular, it starts with a guess of
    ``delta[i]`` = 1; and iteratively updates ``delta`` with:

    |  delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where *sum1* and *sum2* are defined as:

    |  sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |  sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |  f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                      feature_vector(fs,l)[i] .
    |                      exp(delta[i] . nf(feature_vector(fs,l))))

    Note that *sum1* and *sum2* depend on ``delta``; so they need
    to be re-computed each iteration.

    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
    used to generate a dense encoding for *nf(ltext)*.  This
    allows ``_deltas`` to calculate *sum1* and *sum2* using
    matrices, which yields a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, ``unattested[i]==0`` iff
        ``ffreq_empirical[i]==0``.
    :type unattested: sequence of int
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), 'd')

    # Precompute the A matrix:
    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    # over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), encoding.length()), 'd')

    for tok, label in train_toks:
        dist = classifier.prob_classify(tok)

        for label in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(tok, label)
            # Find the number of active features
            nf = sum(val for (id, val) in feature_vector)
            # Update the A matrix
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Use the following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       exp(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       nf exp(delta[i]nf)
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2 ** nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero.
        for fid in unattested:
            sum2[fid] += 1

        # Update the deltas.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = numpy.sum(abs((ffreq_empirical - sum1))) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            return deltas
    return deltas
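

# Worked restatement (illustrative only) of the Newton step used above, for a
# single feature i: g(delta) = sum1(delta) - ffreq_empirical[i] is the residual
# whose root we want, and sum2(delta) is its derivative with respect to delta,
# so the update below is the standard delta -= g(delta) / g'(delta).
def _newton_step_sketch(delta_i, ffreq_empirical_i, sum1_i, sum2_i):
    return delta_i - (ffreq_empirical_i - sum1_i) / (-sum2_i)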
######################################################################
# { Classifier Trainer: megam
######################################################################

# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used.  But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.


def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """
    explicit = True
    bernoulli = True
    if 'explicit' in kwargs:
        explicit = kwargs['explicit']
    if 'bernoulli' in kwargs:
        bernoulli = kwargs['bernoulli']

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option.  Not sure where the best place for it is.
        count_cutoff = kwargs.get('count_cutoff', 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError('Specify encoding or labels, not both')

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
        with open(trainfile_name, 'w') as trainfile:
            write_megam_file(
                train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
            )
        os.close(fd)
    except (OSError, IOError, ValueError) as e:
        raise ValueError('Error while creating megam training file: %s' % e)

    # Run megam on the training file.
    options = []
    options += ['-nobias', '-repeat', '10']
    if explicit:
        options += ['-explicit']
    if not bernoulli:
        options += ['-fvals']
    if gaussian_prior_sigma:
        # Lambda is just the precision of the Gaussian prior, i.e. it's the
        # inverse variance, so the parameter conversion is 1.0/sigma**2.
        # See http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf.
        inv_variance = 1.0 / gaussian_prior_sigma ** 2
    else:
        inv_variance = 0
    options += ['-lambda', '%.2f' % inv_variance, '-tune']
    if trace < 3:
        options += ['-quiet']
    if 'max_iter' in kwargs:
        options += ['-maxi', '%s' % kwargs['max_iter']]
    if 'll_delta' in kwargs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
    if hasattr(encoding, 'cost'):
        options += ['-multilabel']  # each possible la
    options += ['multiclass', trainfile_name]
    stdout = call_megam(options)
    # print './megam_i686.opt ', ' '.join(options)

    # Delete the training file
    try:
        os.remove(trainfile_name)
    except (OSError, IOError) as e:
        print('Warning: unable to delete %s: %s' % (trainfile_name, e))

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)
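

# Worked note (illustrative only) on the base-e to base-2 conversion applied
# above: megam reports natural-log weights, while the rest of this module
# treats weights as base-2 log values, so each weight is rescaled by log2(e);
# e**w == 2**(w * log2(e)).
def _base_conversion_sketch(weight_base_e):
    return weight_base_e * numpy.log2(numpy.e)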
######################################################################
# { Classifier Trainer: tadm
######################################################################


class TadmMaxentClassifier(MaxentClassifier):
    @classmethod
    def train(cls, train_toks, **kwargs):
        algorithm = kwargs.get('algorithm', 'tao_lmvm')
        trace = kwargs.get('trace', 3)
        encoding = kwargs.get('encoding', None)
        labels = kwargs.get('labels', None)
        sigma = kwargs.get('gaussian_prior_sigma', 0)
        count_cutoff = kwargs.get('count_cutoff', 0)
        max_iter = kwargs.get('max_iter')
        ll_delta = kwargs.get('min_lldelta')

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix='nltk-tadm-events-', suffix='.gz'
        )
        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix='nltk-tadm-weights-')

        trainfile = gzip_open_unicode(trainfile_name, 'w')
        write_tadm_file(train_toks, encoding, trainfile)
        trainfile.close()

        options = []
        options.extend(['-monitor'])
        options.extend(['-method', algorithm])
        if sigma:
            options.extend(['-l2', '%.6f' % sigma ** 2])
        if max_iter:
            options.extend(['-max_it', '%d' % max_iter])
        if ll_delta:
            options.extend(['-fatol', '%.6f' % abs(ll_delta)])
        options.extend(['-events_in', trainfile_name])
        options.extend(['-params_out', weightfile_name])
        if trace < 3:
            options.extend(['2>&1'])
        else:
            options.extend(['-summary'])

        call_tadm(options)

        with open(weightfile_name, 'r') as weightfile:
            weights = parse_tadm_weights(weightfile)

        os.remove(trainfile_name)
        os.remove(weightfile_name)

        # Convert from base-e to base-2 weights.
        weights *= numpy.log2(numpy.e)

        # Build the classifier
        return cls(encoding, weights)


######################################################################
# { Demo
######################################################################


def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(MaxentClassifier.train)


if __name__ == '__main__':
    demo()