ibm1.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

  14. """
  15. Lexical translation model that ignores word order.
  16. In IBM Model 1, word order is ignored for simplicity. As long as the
  17. word alignments are equivalent, it doesn't matter where the word occurs
  18. in the source or target sentence. Thus, the following three alignments
  19. are equally likely.
  20. Source: je mange du jambon
  21. Target: i eat some ham
  22. Alignment: (0,0) (1,1) (2,2) (3,3)
  23. Source: je mange du jambon
  24. Target: some ham eat i
Alignment: (0,2) (1,3) (2,1) (3,0)

Source: du jambon je mange
Target: eat i some ham
Alignment: (0,3) (1,2) (2,0) (3,1)

Note that an alignment is represented here as
(word_index_in_target, word_index_in_source).

The EM algorithm used in Model 1 is:
E step - In the training data, count how many times a source language
         word is translated into a target language word, weighted by
         the prior probability of the translation.

M step - Estimate the new probability of translation based on the
         counts from the Expectation step.

Notations:
i: Position in the source sentence
   Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
   Valid values are 1, 2, ..., length of target sentence
s: A word in the source language
t: A word in the target language

References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
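
For reference, following Koehn (2010), Model 1 models the probability of
a target sentence t_1 ... t_m and an alignment a given a source sentence
s_0 ... s_l (where s_0 is NULL) as

    P(t, a | s) = epsilon / (l + 1)^m * prod_{j=1..m} t(t_j | s_{a(j)})

where epsilon is a normalization constant and t(t_j | s_{a(j)}) is the
lexical translation probability stored in ``translation_table``.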
  51. """
from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate.ibm_model import Counts
import warnings


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(ibm1.translation_table['buch']['book'])
    0.889...
    >>> print(ibm1.translation_table['das']['book'])
    0.061...
    >>> print(ibm1.translation_table['buch'][None])
    0.113...
    >>> print(ibm1.translation_table['ja'][None])
    0.072...

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations,
                 probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``.
            See ``IBMModel`` for the type and purpose of this table.
        :type probability_tables: dict[str]: object
        """
        super(IBMModel1, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables['translation_table']

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
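        # Initially, every target word is equally likely to be produced by
        # any source word, so t(t|s) = 1 / (size of the target vocabulary).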
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn(
                "Target language vocabulary is too large ("
                + str(len(self.trg_vocab))
                + " words). "
                "Results may be less accurate."
            )

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)

    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
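            # For each target word t, the fractional count credited to a
            # source word s is the posterior probability that t is aligned
            # to s, i.e. t(t|s) divided by the sum of t(t|s') over every s'
            # in src_sentence (including NULL).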
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroeth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None],
                            IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)
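

if __name__ == '__main__':
    # Minimal usage sketch: train Model 1 for 5 iterations on the toy
    # corpus from the class docstring above and inspect what it learned.
    # Assumes the nltk.translate package is importable; the expected
    # probability (~0.889) comes from the doctest above.
    bitext = [
        AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']),
        AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']),
        AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']),
        AlignedSent(['das', 'haus'], ['the', 'house']),
        AlignedSent(['das', 'buch'], ['the', 'book']),
        AlignedSent(['ein', 'buch'], ['a', 'book']),
    ]
    ibm1 = IBMModel1(bitext, 5)
    print(ibm1.translation_table['buch']['book'])  # approximately 0.889
    print(bitext[2].alignment)  # best alignment found for the third pair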