# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
#
# URL: <http://nltk.sourceforge.net>
# For license information, see LICENSE.TXT

r"""
Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert MacIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""

import re

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(mor)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)\s",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
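
    # Note: (?#X) is an inline regex comment and matches nothing.  The
    # tokenizer compiles these patterns unchanged and substitutes r' \1 \2 ',
    # splitting e.g. "cannot" -> "can not", while the detokenizer replaces
    # (?#X) with r'\s' so the same patterns match the space-separated tokens
    # and can join them back together.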


class TreebankWordTokenizer(TokenizerI):
    """
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
    This is the method that is invoked by ``word_tokenize()``. It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of a line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    """

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r'^\"'), r'``'),
        (re.compile(r'(``)'), r' \1 '),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r'\1 `` '),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r'([:,])([^\d])'), r' \1 \2'),
        (re.compile(r'([:,])$'), r' \1 '),
        (re.compile(r'\.\.\.'), r' ... '),
        (re.compile(r'[;@#$%&]'), r' \g<0> '),
        (
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r'\1 \2\3 ',
        ),  # Handles the final period.
        (re.compile(r'[?!]'), r' \g<0> '),
        (re.compile(r"([^'])' "), r"\1 ' "),
    ]

    # Pads parentheses
    PARENS_BRACKETS = (re.compile(r'[\]\[\(\)\{\}\<\>]'), r' \g<0> ')

    # Optionally convert parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r'\('), '-LRB-'),
        (re.compile(r'\)'), '-RRB-'),
        (re.compile(r'\['), '-LSB-'),
        (re.compile(r'\]'), '-RSB-'),
        (re.compile(r'\{'), '-LCB-'),
        (re.compile(r'\}'), '-RCB-'),
    ]

    DOUBLE_DASHES = (re.compile(r'--'), r' -- ')

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r'"'), " '' "),
        (re.compile(r'(\S)(\'\')'), r'\1 \2 '),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
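
    # CONTRACTIONS4 is deliberately not compiled here: those patterns stay
    # commented out in tokenize() below, mirroring the original sed script.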

    def tokenize(self, text, convert_parentheses=False, return_str=False):
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # add extra space to make things easier
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r' \1 \2 ', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r' \1 \2 ', text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self._contractions.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        return text if return_str else text.split()
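
    # Note: with return_str=True, tokenize() returns the space-padded string
    # produced by the regex cascade instead of splitting it; calling .split()
    # on that string yields exactly the token list shown in the doctests above.
    # The precise internal spacing is an implementation detail.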

    def span_tokenize(self, text):
        """
        Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88\\nin New (York).  Please (buy) me\\ntwo of them.\\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True

        Additional example:

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\\n each in New (York)."'''
        >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
        ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
        ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
        ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
        ... (82, 83), (83, 84)]
        >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
        ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
        ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
        >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
        True
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes
        # Do this only if original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        for tok in align_tokens(tokens, text):
            yield tok


class TreebankWordDetokenizer(TokenizerI):
    """
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - There are additional assumptions made when undoing the padding of the
      [;@#$%&] punctuation symbols that are not presupposed by the Treebank
      tokenizer.
    - There are additional regexes added when reversing the parentheses
      tokenization, e.g. the r'([\]\)\}\>])\s([:;,.])' pattern removes the
      right padding added to a closing parenthesis preceding [:;,.].
    - It is not possible to restore the original whitespace exactly, because
      there is no explicit record of where '\n', '\t' or '\s' characters were
      removed by the text.split() operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\\nin New (York). Please (buy) me\\ntwo of them.\\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it is safe to add extra spaces, but during detokenization
    simply undoing the padding is not enough, so a few rules differ:

    - During tokenization, [!?] is padded on both sides; when detokenizing, only
      the left pad of [!?] needs to be removed.
      Thus (re.compile(r'\s([?!])'), r'\g<1>').
    - During tokenization, [:,] are padded on both sides, but when detokenizing
      only the left pad is removed, and the right pad after a comma/colon is kept
      if the following character is a non-digit.
      Thus (re.compile(r'\s([:,])\s([^\d])'), r'\1 \2').

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    """

    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = [
        re.compile(pattern.replace('(?#X)', r'\s'))
        for pattern in _contractions.CONTRACTIONS2
    ]
    CONTRACTIONS3 = [
        re.compile(pattern.replace('(?#X)', r'\s'))
        for pattern in _contractions.CONTRACTIONS3
    ]

    # ending quotes
    ENDING_QUOTES = [
        (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
        (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
        (re.compile(r'(\S)(\'\')'), r'\1\2 '),
        (re.compile(r" '' "), '"'),
    ]

    # Handles double dashes
    DOUBLE_DASHES = (re.compile(r' -- '), r'--')

    # Optionally convert parentheses and brackets back from PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile('-LRB-'), '('),
        (re.compile('-RRB-'), ')'),
        (re.compile('-LSB-'), '['),
        (re.compile('-RSB-'), ']'),
        (re.compile('-LCB-'), '{'),
        (re.compile('-RCB-'), '}'),
    ]

    # Undo padding on parentheses.
    PARENS_BRACKETS = [
        (re.compile(r'\s([\[\(\{\<])\s'), r' \g<1>'),
        (re.compile(r'\s([\]\)\}\>])\s'), r'\g<1> '),
        (re.compile(r'([\]\)\}\>])\s([:;,.])'), r'\1\2'),
    ]

    # punctuation
    PUNCTUATION = [
        (re.compile(r"([^'])\s'\s"), r"\1' "),
        (re.compile(r'\s([?!])'), r'\g<1>'),  # Strip left pad for [?!]
        # (re.compile(r'\s([?!])\s'), r'\g<1>'),
        (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r'\1\2\3'),
        # When tokenizing, [;@#$%&] are padded with whitespace regardless of
        # whether there are spaces before or after them.
        # But during detokenization, we need to distinguish between left/right
        # pad, so we split this up.
        (re.compile(r'\s([#$])\s'), r' \g<1>'),  # Left pad.
        (re.compile(r'\s([;%])\s'), r'\g<1> '),  # Right pad.
        (re.compile(r'\s([&])\s'), r' \g<1> '),  # Unknown pad.
        (re.compile(r'\s\.\.\.\s'), r'...'),
        (re.compile(r'\s([:,])\s$'), r'\1'),
        (
            re.compile(r'\s([:,])\s([^\d])'),
            r'\1 \2',
        ),  # Keep right pad after comma/colon before non-digits.
        # (re.compile(r'\s([:,])\s([^\d])'), r'\1\2')
    ]

    # starting quotes
    STARTING_QUOTES = [
        (re.compile(r'([ (\[{<])\s``'), r'\1"'),
        (re.compile(r'\s(``)\s'), r'\1'),
        (re.compile(r'^``'), r'\"'),
    ]
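
    # In tokenize() below, these rule groups are applied in roughly the reverse
    # order of TreebankWordTokenizer.tokenize(): contractions first, then ending
    # quotes, double dashes, parentheses/brackets, punctuation, and finally
    # starting quotes.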

    def tokenize(self, tokens, convert_parentheses=False):
        """
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: list(str)
        :return: str
        """
        text = ' '.join(tokens)
        # Reverse the contractions regexes.
        # Note: CONTRACTIONS4 are not used in tokenization.
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r'\1\2', text)
        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r'\1\2', text)

        # Reverse the regexes applied for ending quotes.
        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # Undo the space padding.
        text = text.strip()

        # Reverse the padding on double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Reverse the padding regexes applied for parenthesis/brackets.
        for regexp, substitution in self.PARENS_BRACKETS:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for punctuations.
        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Reverse the regexes applied for starting quotes.
        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.strip()

    def detokenize(self, tokens, convert_parentheses=False):
        """Duck-typing the abstract *tokenize()*."""
        return self.tokenize(tokens, convert_parentheses)
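

# Illustrative usage sketch: a minimal round trip through the tokenizer and
# detokenizer, mirroring the doctest examples above (expected outputs are
# taken from those doctests).
if __name__ == '__main__':
    _tokenizer = TreebankWordTokenizer()
    _detokenizer = TreebankWordDetokenizer()

    sample = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
    tokens = _tokenizer.tokenize(sample)
    print(tokens)
    # ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please',
    #  'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']

    print(_detokenizer.detokenize(tokens))
    # Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.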