toktok.py

# -*- coding: utf-8 -*-
# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jon Dehdari
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT
  10. """
  11. The tok-tok tokenizer is a simple, general tokenizer, where the input has one
  12. sentence per line; thus only final period is tokenized.
  13. Tok-tok has been tested on, and gives reasonably good results for English,
  14. Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
  15. The input should be in UTF-8 encoding.
  16. Reference:
  17. Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
  18. Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
  19. """
import re

from six import text_type
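# `text_type` is six's alias for the unicode string type:
# `unicode` on Python 2 and `str` on Python 3.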
from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
  24. """
  25. This is a Python port of the tok-tok.pl from
  26. https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
  27. >>> toktok = ToktokTokenizer()
  28. >>> text = u'Is 9.5 or 525,600 my favorite number?'
  29. >>> print (toktok.tokenize(text, return_str=True))
  30. Is 9.5 or 525,600 my favorite number ?
  31. >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
  32. >>> print (toktok.tokenize(text, return_str=True))
  33. The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
  34. >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
  35. >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
  36. >>> assert toktok.tokenize(text, return_str=True) == expected
  37. >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
  38. True
  39. """
    # Replace non-breaking spaces with normal spaces.
    NON_BREAKING = re.compile(u"\u00A0"), " "

    # Pad some funky punctuation.
    FUNKY_PUNCT_1 = re.compile(u'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
    # Pad more funky punctuation.
    FUNKY_PUNCT_2 = re.compile(u'([({\[“‘„‚«‹「『])'), r" \1 "
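    # NB: FUNKY_PUNCT_2 is not applied in TOKTOK_REGEXES below; most of the
    # brackets it covers are handled by OPEN_PUNCT_RE instead.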
    # Pad en dash and em dash.
    EN_EM_DASHES = re.compile(u'([–—])'), r" \1 "

    # Replace problematic characters with numeric character references.
    AMPERCENT = re.compile('& '), '&amp; '
    TAB = re.compile('\t'), ' &#9; '
    PIPE = re.compile('\|'), ' &#124; '

    # Pad commas, except those between digits, so that numbers with commas
    # are kept from further tokenization.
    COMMA_IN_NUM = re.compile(r'(?<!,)([,،])(?![,\d])'), r' \1 '
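    # e.g. the comma in u'1,000' is left alone, but u'yes,no' becomes u'yes , no'.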
    # Just pad problematic (often neurotic) hyphens/single quotes, etc.
    PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r' \1 '
    # Group the spaced-out quote pairs ` ` and ' ' back into single tokens.
    STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
    STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
    # Don't tokenize a period unless it ends the line and isn't
    # preceded by another period, e.g.
    # "something ..." -> "something ..."
    # "something." -> "something ."
    FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
    # Don't tokenize a period unless it ends the line, e.g.
    # " ... stuff." -> "... stuff ."
    FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
    # Treat continuous commas as a fake German/Czech etc. low quote: „
    MULTI_COMMAS = re.compile(r'(,{2,})'), r' \1 '
    # Treat continuous dashes as a fake en dash, etc.
    MULTI_DASHES = re.compile(r'(-{2,})'), r' \1 '
    # Treat multiple periods as a single thing (e.g. an ellipsis).
    MULTI_DOTS = re.compile(r'(\.{2,})'), r' \1 '
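    # e.g. the u'...' in u'Wait...' stays together as one token.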
    # This is the \p{Open_Punctuation} from Perl's perluniprops,
    # see http://perldoc.perl.org/perluniprops.html
    OPEN_PUNCT = text_type(
        u'([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d'
        u'\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772'
        u'\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983'
        u'\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993'
        u'\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26'
        u'\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016'
        u'\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39'
        u'\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b'
        u'\ufe5d\uff08\uff3b\uff5b\uff5f\uff62'
    )
    # This is the \p{Close_Punctuation} from Perl's perluniprops.
    CLOSE_PUNCT = text_type(
        u')]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a'
        u'\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6'
        u'\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988'
        u'\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998'
        u'\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009'
        u'\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b'
        u'\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c'
        u'\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e'
        u'\uff09\uff3d\uff5d\uff60\uff63'
    )
    # This is the \p{Currency_Symbol} from Perl's perluniprops.
    CURRENCY_SYM = text_type(
        u'$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb'
        u'\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3'
        u'\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab'
        u'\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3'
        u'\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838'
        u'\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6'
    )
    # Pad spaces after opening punctuation.
    OPEN_PUNCT_RE = re.compile(u'([{}])'.format(OPEN_PUNCT)), r'\1 '
    # Pad spaces before closing punctuation.
    CLOSE_PUNCT_RE = re.compile(u'([{}])'.format(CLOSE_PUNCT)), r'\1 '
    # Pad spaces after currency symbols.
    CURRENCY_SYM_RE = re.compile(u'([{}])'.format(CURRENCY_SYM)), r'\1 '
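    # e.g. u'$100' becomes u'$ 100'.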
    # Used for tokenizing URL-unfriendly characters: [:/?#]
    URL_FOE_1 = re.compile(r':(?!//)'), r' : '  # in perl s{:(?!//)}{ : }g;
    URL_FOE_2 = re.compile(r'\?(?!\S)'), r' ? '  # in perl s{\?(?!\S)}{ ? }g;
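    # e.g. u'time: 4pm' becomes u'time : 4pm', while the scheme colon in
    # u'https://example.com' is left alone.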
    # in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
    URL_FOE_3 = re.compile(r'(:\/\/)[\S+\.\S+\/\S+][\/]'), ' / '
    URL_FOE_4 = re.compile(r' /'), r' / '  # s{ /}{ / }g;
    # Left/right strip, i.e. remove leading/trailing spaces.
    # These strip regexes should NOT be used;
    # use str.lstrip(), str.rstrip() or str.strip() instead.
    # (They are kept for reference to the original toktok.pl code.)
    LSTRIP = re.compile(r'^ +'), ''
    RSTRIP = re.compile(r'\s+$'), '\n'
    # Merge multiple spaces.
    ONE_SPACE = re.compile(r' {2,}'), ' '
    TOKTOK_REGEXES = [
        NON_BREAKING,
        FUNKY_PUNCT_1,
        URL_FOE_1,
        URL_FOE_2,
        URL_FOE_3,
        URL_FOE_4,
        AMPERCENT,
        TAB,
        PIPE,
        OPEN_PUNCT_RE,
        CLOSE_PUNCT_RE,
        MULTI_COMMAS,
        COMMA_IN_NUM,
        FINAL_PERIOD_2,
        PROB_SINGLE_QUOTES,
        STUPID_QUOTES_1,
        STUPID_QUOTES_2,
        CURRENCY_SYM_RE,
        EN_EM_DASHES,
        MULTI_DASHES,
        MULTI_DOTS,
        FINAL_PERIOD_1,
        FINAL_PERIOD_2,
        ONE_SPACE,
    ]
    def tokenize(self, text, return_str=False):
        text = text_type(text)  # Convert the input string to unicode.
        for regexp, substitution in self.TOKTOK_REGEXES:
            text = regexp.sub(substitution, text)
        # Finally, strip leading and trailing spaces
        # and make sure the output is a unicode string.
        text = text_type(text.strip())
        return text if return_str else text.split()
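
# A minimal usage sketch: running this file directly tokenizes a sample
# sentence (illustrative demo, not part of the original tok-tok.pl port).
if __name__ == '__main__':
    toktok = ToktokTokenizer()
    print(toktok.tokenize(u'Hello, world! It costs $9.99 (really).',
                          return_str=True))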