
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Avital Pekker <avital.pekker@utoronto.ca>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the languages and the text
to be identified, then compares the two using a distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
http://borel.slu.edu/crubadan/index.html
"""
# Ensure that literal strings default to unicode rather than str.
from __future__ import print_function, unicode_literals

from nltk.compat import PY3
from nltk.util import trigrams

if PY3:
    from sys import maxsize
else:
    from sys import maxint

# Note: this is NOT the "re" module you're likely used to. The regex
# module is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex".
try:
    import regex as re
except ImportError:
    re = None
######################################################################
## Language identification using TextCat
######################################################################


class TextCat(object):

    _corpus = None
    fingerprints = {}
    _START_CHAR = "<"
    _END_CHAR = ">"

    last_distances = {}

    def __init__(self):
        if not re:
            raise EnvironmentError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        ''' Get rid of punctuation except apostrophes '''
        return re.sub(r"[^\P{P}\']+", "", text)
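    # \P{P} matches any non-punctuation codepoint, so the class
    # [^\P{P}\'] matches punctuation other than the apostrophe.
    # For example (illustrative):
    #     remove_punctuation("Don't stop, believing!") -> "Don't stop believing"
    # Whitespace is untouched because it is not in the Unicode
    # punctuation category P.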
    def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)

        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint
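    # For example (illustrative): the token "cat" is padded to "<cat>",
    # which yields the trigrams "<ca", "cat" and "at>"; the FreqDist
    # counts how often each trigram occurs across all tokens.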
    def calc_dist(self, lang, trigram, text_profile):
        ''' Calculate the "out-of-place" measure between the
            text and language profile for a single trigram '''

        lang_fd = self._corpus.lang_freq(lang)
        dist = 0

        if trigram in lang_fd:
            # Distance is the gap between the trigram's position in
            # the language profile and its position in the text profile.
            idx_lang_profile = list(lang_fd.keys()).index(trigram)
            idx_text = list(text_profile.keys()).index(trigram)

            dist = abs(idx_lang_profile - idx_text)
        else:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            if PY3:
                dist = maxsize
            else:
                dist = maxint

        return dist
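    # For example (illustrative): if a trigram sits at position 3 in the
    # language profile and position 7 in the text profile, its
    # out-of-place distance is abs(3 - 7) = 4. Trigrams unknown to the
    # language contribute the maximum distance instead.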
    def lang_dists(self, text):
        ''' Calculate the "out-of-place" measure between
            the text and all languages '''

        distances = {}
        profile = self.profile(text)
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            # Calculate distance metric for every trigram in
            # input text to be identified
            lang_dist = 0
            for trigram in profile:
                lang_dist += self.calc_dist(lang, trigram, profile)

            distances[lang] = lang_dist

        return distances
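    # The result maps ISO 639-3 codes to summed distances, e.g.
    # (illustrative) {'eng': 152340, 'fra': 173002, ...}; the language
    # with the smallest total is the closest match.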
    def guess_language(self, text):
        ''' Find the language with the min distance
            to the text and return its ISO 639-3 code '''
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)
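    # After a call to guess_language(), the per-language totals remain
    # available on the instance; for example (illustrative):
    #     tc.guess_language('bonjour tout le monde')
    #     sorted(tc.last_distances.items(), key=lambda kv: kv[1])[:3]
    # lists the three closest candidate languages.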

def demo():
    from nltk.corpus import udhr

    langs = [
        'Kurdish-UTF8',
        'Abkhaz-UTF8',
        'Farsi_Persian-UTF8',
        'Hindi-UTF8',
        'Hawaiian-UTF8',
        'Russian-UTF8',
        'Vietnamese-UTF8',
        'Serbian_Srpski-UTF8',
        'Esperanto-UTF8',
    ]

    friendly = {
        'kmr': 'Northern Kurdish',
        'abk': 'Abkhazian',
        'pes': 'Iranian Persian',
        'hin': 'Hindi',
        'haw': 'Hawaiian',
        'rus': 'Russian',
        'vie': 'Vietnamese',
        'srp': 'Serbian',
        'epo': 'Esperanto',
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)
        rows = len(raw_sentences) - 1
        cols = list(map(len, raw_sentences))

        sample = ''

        # Generate a sample text of the language
        for i in range(0, rows):
            cur_sent = ''
            for j in range(0, cols[i]):
                cur_sent += ' ' + raw_sentences[i][j]

            sample += cur_sent

        # Try to detect what it is
        print('Language snippet: ' + sample[0:140] + '...')
        guess = tc.guess_language(sample)
        print('Language detection: %s (%s)' % (guess, friendly[guess]))
        print('#' * 140)


if __name__ == '__main__':
    demo()