# -*- coding: utf-8 -*-
# Natural Language Toolkit: IBM Model 1
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Chin Yee Lee <c.lee32@student.unimelb.edu.au>
#         Hengfeng Li <hengfeng12345@gmail.com>
#         Ruxin Hou <r.hou@student.unimelb.edu.au>
#         Calvin Tanujaya Lim <c.tanujayalim@gmail.com>
# Based on earlier version by:
#         Will Zhang <wilzzha@gmail.com>
#         Guan Gui <ggui@student.unimelb.edu.au>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- Lexical translation model that ignores word order.
- In IBM Model 1, word order is ignored for simplicity. As long as the
- word alignments are equivalent, it doesn't matter where the word occurs
- in the source or target sentence. Thus, the following three alignments
- are equally likely.
- Source: je mange du jambon
- Target: i eat some ham
- Alignment: (0,0) (1,1) (2,2) (3,3)
- Source: je mange du jambon
- Target: some ham eat i
- Alignment: (0,2) (1,3) (2,1) (3,1)
- Source: du jambon je mange
- Target: eat i some ham
- Alignment: (0,3) (1,2) (2,0) (3,1)
- Note that an alignment is represented here as
- (word_index_in_target, word_index_in_source).
- The EM algorithm used in Model 1 is:
- E step - In the training data, count how many times a source language
- word is translated into a target language word, weighted by
- the prior probability of the translation.
- M step - Estimate the new probability of translation based on the
- counts from the Expectation step.
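
A small worked illustration (the NULL source word is omitted here for
brevity): given the two pairs ('la maison', 'the house') and
('la', 'the'), with every translation probability initialized to 0.5,
the E step splits the counts of the longer pair evenly, while the
shorter pair contributes a full count of 1.0 to ('the', 'la'). The
M step then estimates t('the' | 'la') = 1.5 / 2.0 = 0.75, shifting
probability mass towards the correct translation; further iterations
sharpen the estimate.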

Notations:
i: Position in the source sentence
   Valid values are 0 (for NULL), 1, 2, ..., length of source sentence
j: Position in the target sentence
   Valid values are 1, 2, ..., length of target sentence
s: A word in the source language
t: A word in the target language

References:
Philipp Koehn. 2010. Statistical Machine Translation.
Cambridge University Press, New York.

Peter E. Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and
Robert L. Mercer. 1993. The Mathematics of Statistical Machine
Translation: Parameter Estimation. Computational Linguistics, 19 (2),
263-311.
"""
from __future__ import division
from collections import defaultdict
from nltk.translate import AlignedSent
from nltk.translate import Alignment
from nltk.translate import IBMModel
from nltk.translate.ibm_model import Counts
import warnings


class IBMModel1(IBMModel):
    """
    Lexical translation model that ignores word order

    >>> bitext = []
    >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big']))
    >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small']))
    >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house']))
    >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book']))
    >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book']))
    >>> ibm1 = IBMModel1(bitext, 5)

    >>> print(ibm1.translation_table['buch']['book'])
    0.889...
    >>> print(ibm1.translation_table['das']['book'])
    0.061...
    >>> print(ibm1.translation_table['buch'][None])
    0.113...
    >>> print(ibm1.translation_table['ja'][None])
    0.072...

    >>> test_sentence = bitext[2]
    >>> test_sentence.words
    ['das', 'buch', 'ist', 'ja', 'klein']
    >>> test_sentence.mots
    ['the', 'book', 'is', 'small']
    >>> test_sentence.alignment
    Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)])
    """

    def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int

        :param probability_tables: Optional. Use this to pass in custom
            probability values. If not specified, probabilities will be
            set to a uniform distribution, or some other sensible value.
            If specified, the following entry must be present:
            ``translation_table``. See ``IBMModel`` for the type and
            purpose of this table.
        :type probability_tables: dict[str]: object
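
        A minimal sketch of a custom table (the probability values and
        the ``bitext`` variable here are illustrative only)::

            table = defaultdict(lambda: defaultdict(lambda: 0.1))
            table['haus']['house'] = 0.9  # P(target 'haus' | source 'house')
            ibm1 = IBMModel1(bitext, 5, {'translation_table': table})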
- """
- super(IBMModel1, self).__init__(sentence_aligned_corpus)
- if probability_tables is None:
- self.set_uniform_probabilities(sentence_aligned_corpus)
- else:
- # Set user-defined probabilities
- self.translation_table = probability_tables['translation_table']
- for n in range(0, iterations):
- self.train(sentence_aligned_corpus)
- self.align_all(sentence_aligned_corpus)

    def set_uniform_probabilities(self, sentence_aligned_corpus):
        initial_prob = 1 / len(self.trg_vocab)
        if initial_prob < IBMModel.MIN_PROB:
            warnings.warn("Target language vocabulary is too large ("
                          + str(len(self.trg_vocab)) + " words). "
                          "Results may be less accurate.")

        for t in self.trg_vocab:
            self.translation_table[t] = defaultdict(lambda: initial_prob)

    def train(self, parallel_corpus):
        counts = Counts()
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots

            # E step (a): Compute normalization factors to weigh counts
            total_count = self.prob_all_alignments(src_sentence, trg_sentence)

            # E step (b): Collect counts
            for t in trg_sentence:
                for s in src_sentence:
                    count = self.prob_alignment_point(s, t)
                    normalized_count = count / total_count[t]
                    counts.t_given_s[t][s] += normalized_count
                    counts.any_t_given_s[s] += normalized_count

        # M step: Update probabilities with maximum likelihood estimate
        self.maximize_lexical_translation_probabilities(counts)

    def prob_all_alignments(self, src_sentence, trg_sentence):
        """
        Computes the probability of all possible word alignments,
        expressed as a marginal distribution over target words t

        Each entry in the return value represents the contribution to
        the total alignment probability by the target word t.

        To obtain probability(alignment | src_sentence, trg_sentence),
        simply sum the entries in the return value.

        :return: Probability of t for all s in ``src_sentence``
        :rtype: dict(str): float
        """
        alignment_prob_for_t = defaultdict(lambda: 0.0)
        for t in trg_sentence:
            for s in src_sentence:
                alignment_prob_for_t[t] += self.prob_alignment_point(s, t)
        return alignment_prob_for_t

    def prob_alignment_point(self, s, t):
        """
        Probability that word ``t`` in the target sentence is aligned to
        word ``s`` in the source sentence
        """
        return self.translation_table[t][s]

    def prob_t_a_given_s(self, alignment_info):
        """
        Probability of target sentence and an alignment given the
        source sentence
        """
        prob = 1.0

        for j, i in enumerate(alignment_info.alignment):
            if j == 0:
                continue  # skip the dummy zeroth element
            trg_word = alignment_info.trg_sentence[j]
            src_word = alignment_info.src_sentence[i]
            prob *= self.translation_table[trg_word][src_word]

        return max(prob, IBMModel.MIN_PROB)

    def align_all(self, parallel_corpus):
        for sentence_pair in parallel_corpus:
            self.align(sentence_pair)

    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The best alignment will be set in ``sentence_pair`` when the
        method returns. In contrast with the internal implementation of
        IBM models, the word indices in the ``Alignment`` are zero-
        indexed, not one-indexed.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent
        """
        best_alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = max(self.translation_table[trg_word][None],
                            IBMModel.MIN_PROB)
            best_alignment_point = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                if align_prob >= best_prob:  # prefer newer word in case of tie
                    best_prob = align_prob
                    best_alignment_point = i

            best_alignment.append((j, best_alignment_point))

        sentence_pair.alignment = Alignment(best_alignment)
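

# A short usage sketch (illustrative; not part of the original module):
# train on the toy corpus from the class docstring and inspect the result.
if __name__ == '__main__':
    bitext = [
        AlignedSent(['klein', 'ist', 'das', 'haus'],
                    ['the', 'house', 'is', 'small']),
        AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'],
                    ['the', 'house', 'is', 'big']),
        AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'],
                    ['the', 'book', 'is', 'small']),
        AlignedSent(['das', 'haus'], ['the', 'house']),
        AlignedSent(['das', 'buch'], ['the', 'book']),
        AlignedSent(['ein', 'buch'], ['a', 'book']),
    ]
    ibm1 = IBMModel1(bitext, 5)
    # After five EM iterations, P(buch | book) dominates (~0.889, per the
    # doctest above).
    print(ibm1.translation_table['buch']['book'])
    # Training already stored the best alignment on each sentence pair.
    print(bitext[2].alignment)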