models.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""Language Models"""

from __future__ import division, unicode_literals

from nltk import compat
from nltk.lm.api import LanguageModel, Smoothing
from nltk.lm.smoothing import KneserNey, WittenBell


@compat.python_2_unicode_compatible
class MLE(LanguageModel):
    """Class for providing MLE ngram model scores.

    Inherits initialization from LanguageModel.
    """

    def unmasked_score(self, word, context=None):
        """Returns the MLE score for a word given a context.

        Args:
        - word is expected to be a string
        - context is expected to be something reasonably convertible to a tuple
        """
        return self.context_counts(context).freq(word)
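
# A minimal usage sketch (not part of the original module): fit a bigram MLE
# model with nltk.lm.preprocessing.padded_everygram_pipeline, the standard
# nltk.lm preprocessing helper, then score a word in context.
#
#     >>> from nltk.lm.preprocessing import padded_everygram_pipeline
#     >>> train, vocab = padded_everygram_pipeline(2, [["a", "b", "b", "a"]])
#     >>> lm = MLE(2)
#     >>> lm.fit(train, vocab)
#     >>> lm.score("b", ["a"])  # relative frequency of "b" after "a"
#     0.5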


@compat.python_2_unicode_compatible
class Lidstone(LanguageModel):
    """Provides Lidstone-smoothed scores.

    In addition to initialization arguments from LanguageModel also requires
    a number by which to increase the counts, gamma.
    """

    def __init__(self, gamma, *args, **kwargs):
        super(Lidstone, self).__init__(*args, **kwargs)
        self.gamma = gamma

    def unmasked_score(self, word, context=None):
        """Add-gamma smoothing: Lidstone in general, Laplace when gamma is 1.

        To see which, check the `gamma` attribute on the class.
        """
        counts = self.context_counts(context)
        word_count = counts[word]
        norm_count = counts.N()
        return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma)
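
# Worked instance of the formula above (illustrative numbers, not original
# code): with gamma = 0.1, a vocabulary of 5 types, a context observed twice,
# and the word observed once in that context, the score is
# (1 + 0.1) / (2 + 5 * 0.1) = 1.1 / 2.5 = 0.44.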


@compat.python_2_unicode_compatible
class Laplace(Lidstone):
    """Implements Laplace (add one) smoothing.

    Initialization identical to LanguageModel because gamma is always 1.
    """

    def __init__(self, *args, **kwargs):
        super(Laplace, self).__init__(1, *args, **kwargs)
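
# Sketch (not original code): Laplace is simply Lidstone with gamma pinned to
# 1, so the two classes otherwise score identically.
#
#     >>> Laplace(2).gamma
#     1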


class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.

    The idea to abstract this comes from Chen & Goodman 1995.
    """

    def __init__(self, smoothing_cls, order, **kwargs):
        assert issubclass(smoothing_cls, Smoothing)
        params = kwargs.pop("params", {})
        super(InterpolatedLanguageModel, self).__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
            return self.estimator.unigram_score(word)
        alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])
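
# How the recursion above unfolds for a trigram context (a sketch, using the
# alpha/gamma notation from the Smoothing API): the model backs off one word
# of context at a time until the context is empty, then falls back to the
# smoother's unigram score.
#
#   score(w | (w1, w2)) = alpha(w, (w1, w2))
#                         + gamma(w, (w1, w2)) * score(w | (w2,))
#   score(w | (w2,))    = alpha(w, (w2,))
#                         + gamma(w, (w2,)) * unigram_score(w)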


class WittenBellInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Witten-Bell smoothing."""

    def __init__(self, order, **kwargs):
        super(WittenBellInterpolated, self).__init__(WittenBell, order, **kwargs)


class KneserNeyInterpolated(InterpolatedLanguageModel):
    """Interpolated version of Kneser-Ney smoothing."""

    def __init__(self, order, discount=0.1, **kwargs):
        super(KneserNeyInterpolated, self).__init__(
            KneserNey, order, params={"discount": discount}, **kwargs
        )
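
# A minimal usage sketch for the interpolated models (not part of the original
# module), reusing the same preprocessing helper as the MLE sketch above:
#
#     >>> from nltk.lm.preprocessing import padded_everygram_pipeline
#     >>> train, vocab = padded_everygram_pipeline(3, [["a", "b", "b", "a"]])
#     >>> lm = KneserNeyInterpolated(3, discount=0.75)
#     >>> lm.fit(train, vocab)
#     >>> lm.score("b", ["a", "b"])  # interpolated estimate of P(b | a, b)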