# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- A module for language identification using the TextCat algorithm.
- An implementation of the text categorization algorithm
- presented in Cavnar, W. B. and J. M. Trenkle,
- "N-Gram-Based Text Categorization".
- The algorithm takes advantage of Zipf's law and uses
- n-gram frequencies to profile languages and text-yet to
- be identified-then compares using a distance measure.
- Language n-grams are provided by the "An Crubadan"
- project. A corpus reader was created separately to read
- those files.
- For details regarding the algorithm, see:
- http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
- For details about An Crubadan, see:
- http://borel.slu.edu/crubadan/index.html
- """

# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals

from nltk.compat import PY3
from nltk.util import trigrams

if PY3:
    from sys import maxsize
else:
    from sys import maxint

# Note: this is NOT the "re" module you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
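# For instance (an illustrative snippet, assuming the regex package is
# installed), \p{P} matches any Unicode punctuation character:
#
#     regex.sub(r"\p{P}", "", "Hi, there!")  # -> "Hi there"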
try:
    import regex as re
except ImportError:
    re = None


######################################################################
## Language identification using TextCat
######################################################################


class TextCat(object):

    _corpus = None
    fingerprints = {}
    _START_CHAR = "<"
    _END_CHAR = ">"

    last_distances = {}

    def __init__(self):
        if not re:
            raise EnvironmentError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        ''' Get rid of punctuation except apostrophes '''
        # [^\P{P}'] matches any punctuation character other than the
        # apostrophe, so runs of such characters are stripped out.
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
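        # For illustration (an assumed example, not from the original code):
        # the token "cat" is padded to "<cat>", which yields the trigrams
        # "<ca", "cat", "at>"; their counts form the text's fingerprint.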
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint

    def calc_dist(self, lang, trigram, text_profile):
        ''' Calculate the "out-of-place" measure between the
            text and language profile for a single trigram '''
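        # For illustration (assumed numbers, not from the original code): if a
        # trigram sits at position 2 in the text profile and position 5 in the
        # language profile, its out-of-place distance is |5 - 2| = 3; trigrams
        # missing from the language profile get the maximum penalty below.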
        lang_fd = self._corpus.lang_freq(lang)
        dist = 0

        if trigram in lang_fd:
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)

            # print(idx_lang_profile, ", ", idx_text)
            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            if PY3:
                dist = maxsize
            else:
                dist = maxint

        return dist

    def lang_dists(self, text):
        ''' Calculate the "out-of-place" measure between
            the text and all languages '''
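        # The result maps each ISO 639-3 code to its total distance, e.g.
        # {'eng': 102134, 'deu': 110263, ...} (illustrative numbers only;
        # a smaller value means a closer match).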
        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances

    def guess_language(self, text):
        ''' Find the language with the min distance
            to the text and return its ISO 639-3 code '''
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)


def demo():
    from nltk.corpus import udhr

    langs = [
        'Kurdish-UTF8',
        'Abkhaz-UTF8',
        'Farsi_Persian-UTF8',
        'Hindi-UTF8',
        'Hawaiian-UTF8',
        'Russian-UTF8',
        'Vietnamese-UTF8',
        'Serbian_Srpski-UTF8',
        'Esperanto-UTF8',
    ]

    friendly = {
        'kmr': 'Northern Kurdish',
        'abk': 'Abkhazian',
        'pes': 'Iranian Persian',
        'hin': 'Hindi',
        'haw': 'Hawaiian',
        'rus': 'Russian',
        'vie': 'Vietnamese',
        'srp': 'Serbian',
        'epo': 'Esperanto',
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        rows = len(raw_sentences) - 1
        cols = list(map(len, raw_sentences))

        sample = ''

        # Generate a sample text of the language
        for i in range(0, rows):
            cur_sent = ''
            for j in range(0, cols[i]):
                cur_sent += ' ' + raw_sentences[i][j]

            sample += cur_sent

        # Try to detect what it is
        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        print('Language detection: %s (%s)' % (guess, friendly[guess]))
        print('#' * 140)


if __name__ == '__main__':
    demo()