# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 3
#
# Copyright (C) 2001-2013 NLTK Project
# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

- """
- Translation model that considers how a word can be aligned to
- multiple words in another language.
- IBM Model 3 improves on Model 2 by directly modeling the phenomenon
- where a word in one language may be translated into zero or more words
- in another. This is expressed by the fertility probability,
- n(phi | source word).
- If a source word translates into more than one word, it is possible to
- generate sentences that have the same alignment in multiple ways. This
- is modeled by a distortion step. The distortion probability, d(j|i,l,m),
- predicts a target word position, given its aligned source word's
- position. The distortion probability replaces the alignment probability
- of Model 2.
- The fertility probability is not applicable for NULL. Target words that
- align to NULL are assumed to be distributed uniformly in the target
- sentence. The existence of these words is modeled by p1, the probability
- that a target word produced by a real source word requires another
- target word that is produced by NULL.
- The EM algorithm used in Model 3 is:
- E step - In the training data, collect counts, weighted by prior
- probabilities.
- (a) count how many times a source language word is translated
- into a target language word
- (b) count how many times a particular position in the target
- sentence is aligned to a particular position in the source
- sentence
- (c) count how many times a source word is aligned to phi number
- of target words
- (d) count how many times NULL is aligned to a target word
- M step - Estimate new probabilities based on the counts from the E step
- Because there are too many possible alignments, only the most probable
- ones are considered. First, the best alignment is determined using prior
- probabilities. Then, a hill climbing approach is used to find other good
- candidates.
- Notations:
- i: Position in the source sentence
- Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
- j: Position in the target sentence
- Valid values are 1, 2, ..., length of target sentence
- l: Number of words in the source sentence, excluding NULL
- m: Number of words in the target sentence
- s: A word in the source language
- t: A word in the target language
- phi: Fertility, the number of target words produced by a source word
- p1: Probability that a target word produced by a source word is
- accompanied by another target word that is aligned to NULL
- p0: 1 - p1
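
In this notation, the probability of a target sentence and an
alignment given the source sentence, as computed by
``prob_t_a_given_s`` below, is:

P(t, a | s) = choose(m - phi_0, phi_0) * p1^phi_0 * p0^(m - 2 * phi_0)
              * PRODUCT over i = 1..l of (phi_i! * n(phi_i | s_i))
              * PRODUCT over j = 1..m of (t(t_j | s_a(j)) * d(j | a(j), l, m))

where phi_0 is the number of target words aligned to NULL, phi_i is
the fertility of the source word at position i, and a(j) is the source
position that target position j is aligned to.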

References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter F. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""

from __future__ import division

import warnings
from collections import defaultdict
from math import factorial

from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate import IBMModel2
from nltk.translate.ibm_model import Counts


class IBMModel3(IBMModel):
    """
    Translation model that considers how a word can be aligned to
    multiple words in another language

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
    >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book']))
    >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize']))

    >>> ibm3 = IBMModel3(bitext, 5)

    >>> print(round(ibm3.translation_table['buch']['book'], 3))
    1.0
    >>> print(round(ibm3.translation_table['das']['book'], 3))
    0.0
    >>> print(round(ibm3.translation_table['ja'][None], 3))
    1.0

    >>> print(round(ibm3.distortion_table[1][1][2][2], 3))
    1.0
    >>> print(round(ibm3.distortion_table[1][2][2][2], 3))
    0.0
    >>> print(round(ibm3.distortion_table[2][2][4][5], 3))
    0.75

    >>> print(round(ibm3.fertility_table[2]['summarize'], 3))
    1.0
    >>> print(round(ibm3.fertility_table[1]['book'], 3))
    1.0
    >>> print(round(ibm3.p1, 3))
    0.054

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)])

    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model, a distortion model, a fertility model, and a
        model for generating NULL-aligned words.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, all the following entries must be present:
            ``translation_table``, ``alignment_table``,
            ``fertility_table``, ``p1``, ``distortion_table``.
            See ``IBMModel`` for the type and purpose of these tables.
        :type probability_tables: dict[str]: object
        """
        super(IBMModel3, self).__init__(sentence_aligned_corpus)
        self.reset_probabilities()

        if probability_tables is None:
            # Get translation and alignment probabilities from IBM Model 2
            ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
            self.translation_table = ibm2.translation_table
            self.alignment_table = ibm2.alignment_table
            self.set_uniform_probabilities(sentence_aligned_corpus)
        else:
            # Set user-defined probabilities
            self.translation_table = probability_tables['translation_table']
            self.alignment_table = probability_tables['alignment_table']
            self.fertility_table = probability_tables['fertility_table']
            self.p1 = probability_tables['p1']
            self.distortion_table = probability_tables['distortion_table']

        for n in range(0, iterations):
            self.train(sentence_aligned_corpus)

    def reset_probabilities(self):
        super(IBMModel3, self).reset_probabilities()
        self.distortion_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))
            )
        )
        """
        dict[int][int][int][int]: float. Probability(j | i,l,m).
        Values accessed as ``distortion_table[j][i][l][m]``.
        """

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        # d(j | i,l,m) = 1 / m for all i, j, l, m
        l_m_combinations = set()
        for aligned_sentence in sentence_aligned_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            if (l, m) not in l_m_combinations:
                l_m_combinations.add((l, m))
                initial_prob = 1 / m
                if initial_prob < IBMModel.MIN_PROB:
                    warnings.warn(
                        "A target sentence is too long ("
                        + str(m)
                        + " words). Results may be less accurate."
                    )
                for j in range(1, m + 1):
                    for i in range(0, l + 1):
                        self.distortion_table[j][i][l][m] = initial_prob

        # simple initialization, taken from GIZA++
        self.fertility_table[0] = defaultdict(lambda: 0.2)
        self.fertility_table[1] = defaultdict(lambda: 0.65)
        self.fertility_table[2] = defaultdict(lambda: 0.1)
        self.fertility_table[3] = defaultdict(lambda: 0.04)
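        # The explicit values above sum to 0.99; the remaining 0.01 of
        # probability mass is spread evenly over fertilities 4 to
        # MAX_FERTILITY - 1 below.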
        MAX_FERTILITY = 10
        initial_fert_prob = 0.01 / (MAX_FERTILITY - 4)
        for phi in range(4, MAX_FERTILITY):
            self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob)

        self.p1 = 0.5

    def train(self, parallel_corpus):
        counts = Model3Counts()
        for aligned_sentence in parallel_corpus:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)

            # Sample the alignment space
            sampled_alignments, best_alignment = self.sample(aligned_sentence)
            # Record the most probable alignment
            aligned_sentence.alignment = Alignment(
                best_alignment.zero_indexed_alignment()
            )

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_of_alignments(sampled_alignments)

            # E step (b): Collect counts
            for alignment_info in sampled_alignments:
                count = self.prob_t_a_given_s(alignment_info)
                normalized_count = count / total_count

                for j in range(1, m + 1):
                    counts.update_lexical_translation(
                        normalized_count, alignment_info, j
                    )
                    counts.update_distortion(normalized_count, alignment_info, j, l, m)

                counts.update_null_generation(normalized_count, alignment_info)
                counts.update_fertility(normalized_count, alignment_info)

        # M step: Update probabilities with maximum likelihood estimates
        # If any probability is less than MIN_PROB, clamp it to MIN_PROB
        existing_alignment_table = self.alignment_table
        self.reset_probabilities()
        self.alignment_table = existing_alignment_table  # don't retrain

        self.maximize_lexical_translation_probabilities(counts)
        self.maximize_distortion_probabilities(counts)
        self.maximize_fertility_probabilities(counts)
        self.maximize_null_generation_probabilities(counts)

    def maximize_distortion_probabilities(self, counts):
        MIN_PROB = IBMModel.MIN_PROB
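        # MLE: d(j | i,l,m) = count(j, i, l, m) / count(i, l, m), where the
        # denominator is the numerator summed over all target positions j.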
        for j, i_s in counts.distortion.items():
            for i, src_sentence_lengths in i_s.items():
                for l, trg_sentence_lengths in src_sentence_lengths.items():
                    for m in trg_sentence_lengths:
                        estimate = (
                            counts.distortion[j][i][l][m]
                            / counts.distortion_for_any_j[i][l][m]
                        )
                        self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB)

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        src_sentence = alignment_info.src_sentence
        trg_sentence = alignment_info.trg_sentence
        l = len(src_sentence) - 1  # exclude NULL
        m = len(trg_sentence) - 1
        p1 = self.p1
        p0 = 1 - p1

        probability = 1.0
        MIN_PROB = IBMModel.MIN_PROB

        # Combine NULL insertion probability
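        # Each of the m - phi_0 words generated by a real source word may,
        # with probability p1, be accompanied by a NULL-generated word;
        # phi_0 of them are, hence p1^phi_0 * p0^((m - phi_0) - phi_0).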
        null_fertility = alignment_info.fertility_of_i(0)
        probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)
        if probability < MIN_PROB:
            return MIN_PROB

        # Compute combination (m - null_fertility) choose null_fertility
        for i in range(1, null_fertility + 1):
            probability *= (m - null_fertility - i + 1) / i
            if probability < MIN_PROB:
                return MIN_PROB

        # Combine fertility probabilities
        for i in range(1, l + 1):
            fertility = alignment_info.fertility_of_i(i)
            probability *= (
                factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]
            )
            if probability < MIN_PROB:
                return MIN_PROB

        # Combine lexical and distortion probabilities
        for j in range(1, m + 1):
            t = trg_sentence[j]
            i = alignment_info.alignment[j]
            s = src_sentence[i]

            probability *= (
                self.translation_table[t][s] * self.distortion_table[j][i][l][m]
            )
            if probability < MIN_PROB:
                return MIN_PROB

        return probability


class Model3Counts(Counts):
    """
    Data object to store counts of various parameters during training.
    Includes counts for distortion.
    """

    def __init__(self):
        super(Model3Counts, self).__init__()
        self.distortion = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
        )
        self.distortion_for_any_j = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0))
        )

    def update_distortion(self, count, alignment_info, j, l, m):
        i = alignment_info.alignment[j]
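        # Accumulate both the count for this specific target position j and
        # its marginal over all j, which serves as the M-step denominator.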
        self.distortion[j][i][l][m] += count
        self.distortion_for_any_j[i][l][m] += count