# -*- coding: utf-8 -*-
# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

from re import finditer
from xml.sax.saxutils import escape, unescape


def string_span_tokenize(s, sep):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each occurrence of *sep*.

        >>> from nltk.tokenize.util import string_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(string_span_tokenize(s, " "))
        [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
        (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param sep: the token separator
    :type sep: str
    :rtype: iter(tuple(int, int))
    """
    if len(sep) == 0:
        raise ValueError("Token delimiter must not be empty")
    left = 0
    while True:
        try:
            right = s.index(sep, left)
            if right != 0:
                yield left, right
        except ValueError:
            if left != len(s):
                yield left, len(s)
            break
        left = right + len(sep)
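

# Illustrative sketch (not part of the original module): the (start, end)
# pairs yielded by string_span_tokenize() index directly into the source
# string, so the token strings can be recovered by slicing. The helper name
# below is hypothetical.
def _demo_string_span_tokenize():
    s = "Good muffins cost $3.88 in New York."
    spans = list(string_span_tokenize(s, " "))
    # Returns ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.']
    return [s[start:end] for start, end in spans]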


def regexp_span_tokenize(s, regexp):
    r"""
    Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
    tuples, by splitting the string at each successive match of *regexp*.

        >>> from nltk.tokenize.util import regexp_span_tokenize
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(regexp_span_tokenize(s, r'\s'))
        [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
        (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]

    :param s: the string to be tokenized
    :type s: str
    :param regexp: regular expression that matches token separators (must not be empty)
    :type regexp: str
    :rtype: iter(tuple(int, int))
    """
    left = 0
    for m in finditer(regexp, s):
        # m.span() is the separator's (start, end): the start ends the current
        # token and the end begins the next one. Avoid shadowing built-in next().
        right, next_left = m.span()
        if right != left:
            yield left, right
        left = next_left
    yield left, len(s)
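

# Illustrative sketch (not part of the original module): a greedy pattern such
# as r"\s+" consumes a whole run of whitespace as one separator, so the spans
# slice cleanly back into tokens. The helper name below is hypothetical.
def _demo_regexp_span_tokenize():
    s = "Good muffins cost $3.88\nin New York.  Please buy me."
    spans = list(regexp_span_tokenize(s, r"\s+"))
    # Returns ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    # 'Please', 'buy', 'me.']
    return [s[start:end] for start, end in spans]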


def spans_to_relative(spans):
    r"""
    Return a sequence of relative spans, given a sequence of spans.

        >>> from nltk.tokenize import WhitespaceTokenizer
        >>> from nltk.tokenize.util import spans_to_relative
        >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
        ... two of them.\n\nThanks.'''
        >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s)))
        [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
        (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]

    :param spans: a sequence of (start, end) offsets of the tokens
    :type spans: iter(tuple(int, int))
    :rtype: iter(tuple(int, int))
    """
    prev = 0
    for left, right in spans:
        # Each yielded pair is (gap since the previous token, token length).
        yield left - prev, right - left
        prev = right
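

# Illustrative sketch (not part of the original module): the relative spans are
# (gap, length) pairs, so absolute (start, end) offsets can be recovered by
# accumulating them. The helper name below is hypothetical.
def _demo_relative_to_spans(relative_spans):
    point = 0
    for gap, length in relative_spans:
        start = point + gap
        yield start, start + length
        point = start + length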


class CJKChars(object):
    """
    An object that enumerates the code points of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane

    This is a Python port of the CJK code point enumerations of the Moses tokenizer:
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
    """

    # Hangul Jamo (1100–11FF)
    Hangul_Jamo = (4352, 4607)  # (ord(u"\u1100"), ord(u"\u11ff"))

    # CJK Radicals Supplement (2E80–2EFF)
    # Kangxi Radicals (2F00–2FDF)
    # Ideographic Description Characters (2FF0–2FFF)
    # CJK Symbols and Punctuation (3000–303F)
    # Hiragana (3040–309F)
    # Katakana (30A0–30FF)
    # Bopomofo (3100–312F)
    # Hangul Compatibility Jamo (3130–318F)
    # Kanbun (3190–319F)
    # Bopomofo Extended (31A0–31BF)
    # CJK Strokes (31C0–31EF)
    # Katakana Phonetic Extensions (31F0–31FF)
    # Enclosed CJK Letters and Months (3200–32FF)
    # CJK Compatibility (3300–33FF)
    # CJK Unified Ideographs Extension A (3400–4DBF)
    # Yijing Hexagram Symbols (4DC0–4DFF)
    # CJK Unified Ideographs (4E00–9FFF)
    # Yi Syllables (A000–A48F)
    # Yi Radicals (A490–A4CF)
    CJK_Radicals = (11904, 42191)  # (ord(u"\u2e80"), ord(u"\ua4cf"))

    # Phags-pa (A840–A87F)
    Phags_Pa = (43072, 43135)  # (ord(u"\ua840"), ord(u"\ua87f"))

    # Hangul Syllables (AC00–D7AF)
    Hangul_Syllables = (44032, 55215)  # (ord(u"\uAC00"), ord(u"\uD7AF"))

    # CJK Compatibility Ideographs (F900–FAFF)
    CJK_Compatibility_Ideographs = (63744, 64255)  # (ord(u"\uF900"), ord(u"\uFAFF"))

    # CJK Compatibility Forms (FE30–FE4F)
    CJK_Compatibility_Forms = (65072, 65103)  # (ord(u"\uFE30"), ord(u"\uFE4F"))

    # Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
    Katakana_Hangul_Halfwidth = (65381, 65500)  # (ord(u"\uFF65"), ord(u"\uFFDC"))

    # Supplementary Ideographic Plane 20000–2FFFF
    Supplementary_Ideographic_Plane = (
        131072,
        196607,
    )  # (ord(u"\U00020000"), ord(u"\U0002FFFF"))

    ranges = [
        Hangul_Jamo,
        CJK_Radicals,
        Phags_Pa,
        Hangul_Syllables,
        CJK_Compatibility_Ideographs,
        CJK_Compatibility_Forms,
        Katakana_Hangul_Halfwidth,
        Supplementary_Ideographic_Plane,
    ]
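

# Illustrative sketch (not part of the original module): the ``ranges`` list
# can be scanned directly to classify a single code point; is_cjk() below
# inlines the same intervals as literals. The helper name is hypothetical.
def _demo_in_cjk_ranges(character):
    code_point = ord(character)
    return any(start <= code_point <= end for start, end in CJKChars.ranges)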


def is_cjk(character):
    """
    Python port of Moses' code to check for a CJK character.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
        >>> is_cjk(u'\u33fe')
        True
        >>> is_cjk(u'\uFE5F')
        False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    """
    # These are the same (start, end) pairs as CJKChars.ranges, inlined as literals.
    return any(
        [
            start <= ord(character) <= end
            for start, end in [
                (4352, 4607),
                (11904, 42191),
                (43072, 43135),
                (44032, 55215),
                (63744, 64255),
                (65072, 65103),
                (65381, 65500),
                (131072, 196607),
            ]
        ]
    )
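

# Illustrative sketch (not part of the original module): is_cjk() works one
# character at a time, so extracting the CJK characters from a mixed-script
# string is a simple filter. The helper name is hypothetical.
def _demo_filter_cjk(text):
    # For example, passing u"NLTK loves \u6771\u4eac" returns just the two CJK characters.
    return [char for char in text if is_cjk(char)]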


def xml_escape(text):
    """
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.

    Note that the default xml.sax.saxutils.escape() function doesn't escape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    """
    return escape(
        text,
        entities={
            r"'": r"&apos;",
            r'"': r"&quot;",
            r"|": r"&#124;",
            r"[": r"&#91;",
            r"]": r"&#93;",
        },
    )


def xml_unescape(text):
    """
    This function transforms the "escaped" version suitable for well-formed
    XML formatting back into a human-readable string.

    Note that the default xml.sax.saxutils.unescape() function doesn't unescape
    some characters that Moses does, so we have to manually add them to the
    entities dictionary.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > \' " ] ['''
        >>> xml_unescape(s) == expected
        True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    """
    return unescape(
        text,
        entities={
            r"&apos;": r"'",
            r"&quot;": r'"',
            r"&#124;": r"|",
            r"&#91;": r"[",
            r"&#93;": r"]",
        },
    )
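

# Illustrative sketch (not part of the original module): xml_unescape() inverts
# xml_escape() for the Moses-specific entities, so escaping and then unescaping
# should round-trip the original text. The helper name is hypothetical.
def _demo_xml_roundtrip():
    text = '''He said: "don't use [|] or <tags> & ampersands"'''
    return xml_unescape(xml_escape(text)) == text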


def align_tokens(tokens, sentence):
    """
    This function attempts to find the offsets of the tokens in *sentence*, as
    a sequence of ``(start, end)`` tuples, given the tokens and the source string.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> from nltk.tokenize.util import align_tokens
        >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
        ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
        ... "on Saturday.")
        >>> tokens = TreebankWordTokenizer().tokenize(s)
        >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
        ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
        ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
        ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
        ... (123, 131), (131, 132)]
        >>> output = list(align_tokens(tokens, s))
        >>> len(tokens) == len(expected) == len(output)  # Check that length of tokens and tuples are the same.
        True
        >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
        True
        >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string correspond to the tokens.
        True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int,int))
    """
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError:
            raise ValueError('substring "{}" not found in "{}"'.format(token, sentence))
        point = start + len(token)
        offsets.append((start, point))
    return offsets
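

# Illustrative sketch (not part of the original module): align_tokens() works
# with any tokenization whose tokens appear verbatim, in order, in the source
# string, e.g. a plain str.split(). The helper name is hypothetical.
def _demo_align_tokens():
    sentence = "The plane crashed in Egypt's Sinai desert."
    tokens = sentence.split()
    offsets = align_tokens(tokens, sentence)
    # The recovered slices match the tokens exactly.
    return [sentence[start:end] for start, end in offsets] == tokens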