# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 2
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Lexical translation model that considers word order.

IBM Model 2 improves on Model 1 by accounting for word order.
An alignment probability is introduced, a(i | j,l,m), which predicts
a source word position, given its aligned target word's position.

The EM algorithm used in Model 2 is:

E step - In the training data, collect counts, weighted by prior
         probabilities.
         (a) count how many times a source language word is translated
             into a target language word
         (b) count how many times a particular position in the source
             sentence is aligned to a particular position in the target
             sentence

M step - Estimate new probabilities based on the counts from the E step

Notations:

i: Position in the source sentence
    Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
    Valid values are 1, 2, ..., length of target sentence
l: Number of words in the source sentence, excluding NULL
m: Number of words in the target sentence
s: A word in the source language
t: A word in the target language

References:

Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19(2),
263-311.
"""

from __future__ import division

import warnings
from collections import defaultdict

from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel1
from nltk.translate.ibm_model import Counts


class IBMModel2(IBMModel):
    """
    Lexical translation model that considers word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))

    >>> ibm2 = IBMModel2(bitext, 5)

    >>> print(round(ibm2.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm2.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm2.translation_table['buch'][None], 3))
    0.0
    >>> print(round(ibm2.translation_table['ja'][None], 3))
    0.0

    >>> print(ibm2.alignment_table[1][1][2][2])
    0.938...
    >>> print(round(ibm2.alignment_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm2.alignment_table[2][2][4][5], 3))
    1.0

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model and an alignment model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super(IBMModel2, self).__init__(sentence_aligned_corpus)

        if probability_tables is None:
            # Get translation probabilities from IBM Model 1.
            # Run more iterations of training for Model 1, since it is
            # faster than Model 2.
            ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations)
            self.translation_table = ibm1.translation_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables['translation_table']
            self.alignment_table = probability_tables['alignment_table']

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

        self.align_all(sentence_aligned_corpus)
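
    # Usage sketch (hypothetical, not part of the original module): training
    # can be warm-started from a previously trained model by passing its
    # tables through ``probability_tables``:
    #
    #     trained = IBMModel2(bitext, 5)
    #     tables = {
    #         'translation_table': trained.translation_table,
    #         'alignment_table': trained.alignment_table,
    #     }
    #     resumed = IBMModel2(bitext, 2, probability_tables=tables)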

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
        l_m_combinations = set()
        for aligned_sentence in sentence_aligned_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            if (l, m) not in l_m_combinations:
                l_m_combinations.add((l, m))
                initial_prob = 1 / (l + 1)
                if initial_prob < IBMModel.MIN_PROB:
                    warnings.warn(
                        "A source sentence is too long ("
                        + str(l)
                        + " words). Results may be less accurate."
                    )
                for i in range(0, l + 1):
                    for j in range(1, m + 1):
                        self.alignment_table[i][j][l][m] = initial_prob
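
    # For example, a sentence pair with l = 4 source words and m = 5 target
    # words is initialized with a(i | j, 4, 5) = 1 / (4 + 1) = 0.2 for every
    # i in 0..4 and j in 1..5.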

    def train(self, parallel_corpus):
        counts = Model2Counts()
        for aligned_sentence in parallel_corpus:
            src_sentence = [None] + aligned_sentence.mots
            trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for j in range(1, m + 1):
                t = trg_sentence[j]
                for i in range(0, l + 1):
                    s = src_sentence[i]
                    count = self.prob_alignment_point(i, j, src_sentence, trg_sentence)
                    normalized_count = count / total_count[t]
                    counts.update_lexical_translation(normalized_count, s, t)
                    counts.update_alignment(normalized_count, i, j, l, m)

        # M step: Update probabilities with maximum likelihood estimates
        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_alignment_probabilities(counts)

    def maximize_alignment_probabilities(self, counts):
        MIN_PROB = IBMModel.MIN_PROB
        for i, j_s in counts.alignment.items():
            for j, src_sentence_lengths in j_s.items():
                for l, trg_sentence_lengths in src_sentence_lengths.items():
                    for m in trg_sentence_lengths:
                        estimate = (
                            counts.alignment[i][j][l][m]
                            / counts.alignment_for_any_i[j][l][m]
                        )
                        self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for j in range(1, len(trg_sentence)):
            t = trg_sentence[j]
            for i in range(0, len(src_sentence)):
                alignment_prob_for_t[t] += self.prob_alignment_point(
                    i, j, src_sentence, trg_sentence
                )
        return alignment_prob_for_t

    def prob_alignment_point(self, i, j, src_sentence, trg_sentence):
        """
        Probability that position j in ``trg_sentence`` is aligned to
        position i in the ``src_sentence``
        """
        l = len(src_sentence) - 1
        m = len(trg_sentence) - 1
        s = src_sentence[i]
        t = trg_sentence[j]
        return self.translation_table[t][s] * self.alignment_table[i][j][l][m]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0
        l = len(alignment_info.src_sentence) - 1
        m = len(alignment_info.trg_sentence) - 1

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= (
                self.translation_table[trg_word][src_word]
                * self.alignment_table[i][j][l][m]
            )

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        l = len(sentence_pair.mots)
        m = len(sentence_pair.words)

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = (
                self.translation_table[trg_word][None]
                * self.alignment_table[0][j + 1][l][m]
            )
            best_prob = max(best_prob, IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = (
                    self.translation_table[trg_word][src_word]
                    * self.alignment_table[i + 1][j + 1][l][m]
                )
                if align_prob >= best_prob:
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)


class Model2Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for alignment.
    """
    def __init__(self):
        super(Model2Counts, self).__init__()
        self.alignment = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
        )
        self.alignment_for_any_i = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
        )

    def update_lexical_translation(self, count, s, t):
        self.t_given_s[t][s] += count
        self.any_t_given_s[s] += count

    def update_alignment(self, count, i, j, l, m):
        self.alignment[i][j][l][m] += count
        self.alignment_for_any_i[j][l][m] += count
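

# Minimal, self-contained usage sketch mirroring the doctest in ``IBMModel2``
# above (an addition for illustration; exact probabilities depend on the toy
# corpus). Runs only when this module is executed directly.
if __name__ == "__main__":
    demo_bitext = [
        AlignedSent(['das', 'haus'], ['the', 'house']),
        AlignedSent(['das', 'buch'], ['the', 'book']),
        AlignedSent(['ein', 'buch'], ['a', 'book']),
    ]
    demo_model = IBMModel2(demo_bitext, 5)
    # Learned lexical probability t('buch' | 'book')
    print(round(demo_model.translation_table['buch']['book'], 3))
    # Best (zero-indexed) alignment computed for the second sentence pair
    print(demo_bitext[1].alignment)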