# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

    >>> from nltk.tokenize.util import string_span_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(string_span_tokenize(s, " "))
    [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
    (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            if left != len(s):
                yield left, len(s)
            break

        left = right + len(sep)
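
# Usage sketch (illustrative only, not part of the original module): the
# yielded spans can be sliced out of the source string to recover the token
# strings themselves.
#
#   >>> s = "Good muffins cost $3.88"
#   >>> [s[start:end] for start, end in string_span_tokenize(s, " ")]
#   ['Good', 'muffins', 'cost', '$3.88']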


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

    >>> from nltk.tokenize.util import regexp_span_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(regexp_span_tokenize(s, r'\s'))
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
    (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        right, next_ = m.span()
        if right != left:
            yield left, right
        left = next_
    yield left, len(s)
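
# Usage sketch (illustrative only, not part of the original module): the
# separator may be any regular expression, e.g. a comma followed by optional
# whitespace.
#
#   >>> list(regexp_span_tokenize("a, b,c", r",\s*"))
#   [(0, 1), (3, 4), (5, 6)]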


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> from nltk.tokenize.util import spans_to_relative
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
    [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
    (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        yield left - prev, right - left
        prev = right
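
# Usage sketch (illustrative only): each relative span is a
# (gap-from-previous-end, token-length) pair, so the absolute offsets can be
# recovered by accumulating. relative_to_spans below is a hypothetical helper,
# not part of the module.
#
#   >>> def relative_to_spans(rel_spans):
#   ...     prev = 0
#   ...     for gap, length in rel_spans:
#   ...         yield prev + gap, prev + gap + length
#   ...         prev = prev + gap + length
#   >>> list(relative_to_spans([(0, 4), (1, 7), (1, 4)]))
#   [(0, 4), (5, 12), (13, 17)]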


class CJKChars(object):
    """
    An object that enumerates the code points of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of the Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]


def is_cjk(character):
    """
    Python port of Moses' code to check whether a character is CJK.

    >>> CJKChars().ranges
    [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
    >>> is_cjk(u'\u33fe')
    True
    >>> is_cjk(u'\uFE5F')
    False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    return any(
        [
            start <= ord(character) <= end
            for start, end in [
                (4352, 4607),
                (11904, 42191),
                (43072, 43135),
                (44032, 55215),
                (63744, 64255),
                (65072, 65103),
                (65381, 65500),
                (131072, 196607),
            ]
        ]
    )
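
# Usage sketch (illustrative only, not part of the original module): is_cjk
# operates on single characters, so counting or filtering CJK characters in a
# string is a simple comprehension.
#
#   >>> sum(is_cjk(ch) for ch in u"NLTK-自然言語")
#   4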


def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.

    >>> input_str = ''')| & < > ' " ] ['''
    >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
    >>> escape(input_str) == expected_output
    True
    >>> xml_escape(input_str)
    ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )
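
# Usage note (illustrative only): like xml.sax.saxutils.escape(), this function
# does not detect text that is already escaped, so applying it twice
# double-escapes the ampersands.
#
#   >>> xml_escape('&amp;')
#   '&amp;amp;'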


def xml_unescape(text):
    """
    This function transforms an "escaped" string (suitable for well-formed XML
    formatting) back into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.

    >>> from xml.sax.saxutils import unescape
    >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
    >>> expected = ''')| & < > \' " ] ['''
    >>> xml_unescape(s) == expected
    True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )
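
# Usage sketch (illustrative only): xml_unescape reverses xml_escape, so the
# pair should round-trip ordinary text.
#
#   >>> raw = '''He said: "3 < 5 & 5 > 3" [citation|needed]'''
#   >>> xml_unescape(xml_escape(raw)) == raw
#   True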


def align_tokens(tokens, sentence):
    """
    This function attempts to find the offsets of the tokens in *sentence*, as a
    sequence of ``(start, end)`` tuples, given the tokens and the source string.

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> from nltk.tokenize.util import align_tokens
    >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
    ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
    ... "on Saturday.")
    >>> tokens = TreebankWordTokenizer().tokenize(s)
    >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
    ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
    ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
    ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
    ... (123, 131), (131, 132)]
    >>> output = list(align_tokens(tokens, s))
    >>> len(tokens) == len(expected) == len(output)  # Check that the numbers of tokens and tuples are the same.
    True
    >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
    True
    >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string correspond to the tokens.
    True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int,int))
    """
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError:
            raise ValueError('substring "{}" not found in "{}"'.format(token, sentence))
        point = start + len(token)
        offsets.append((start, point))
    return offsets
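
# Usage sketch (illustrative only, not part of the original module):
# align_tokens only requires that every token occur in the sentence, in order,
# so it also works with output from a simple tokenizer.
#
#   >>> align_tokens(["Hello", ",", "world"], "Hello, world")
#   [(0, 5), (5, 6), (7, 12)]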