casual.py

# coding: utf-8
#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
  12. """
  13. Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
  14. domains and tasks. The basic logic is this:
  15. 1. The tuple regex_strings defines a list of regular expression
  16. strings.
  17. 2. The regex_strings strings are put, in order, into a compiled
  18. regular expression object called word_re.
  19. 3. The tokenization is done by word_re.findall(s), where s is the
  20. user-supplied string, inside the tokenize() method of the class
  21. Tokenizer.
  22. 4. When instantiating Tokenizer objects, there is a single option:
  23. preserve_case. By default, it is set to True. If it is set to
  24. False, then the tokenizer will downcase everything except for
  25. emoticons.
  26. """
######################################################################

from __future__ import unicode_literals

import re

from six import int2byte, unichr
from six.moves import html_entities

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
# <:| and some text >:)
#
# Most importantly, the catch-all element should always be last, since it
# does a last-ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"""                 # Capture 1: entire matched URL
  (?:
    https?:                 # URL protocol and colon
    (?:
      /{1,3}                # 1-3 slashes
      |                     #   or
      [a-z0-9%]             # Single letter or digit or '%'
                            # (Trying not to match e.g. "URI::Escape")
    )
    |                       #   or
    # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                       # One or more:
    [^\s()<>{}\[\]]+        # Run of non-space, non-()<>{}[]
    |                       #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)             # balanced parens, non-recursive: (...)
  )+
  (?:                       # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)             # balanced parens, non-recursive: (...)
    |                       #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]  # not a space or one of these punct chars
  )
  |                         # OR, the following to match naked domains:
  (?:
    (?<!@)                  # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                   # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:             # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:             # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}           # exchange
      [ *\-.\)]*
      \d{4}           # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])  # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)  # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})  # Ellipsis dots.
    |
    (?:\S)  # Everything else that isn't whitespace.
    """,
)
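
# Illustrative only (not part of the original module): each component can be
# compiled on its own for testing; `phone_re` below is just a throwaway local
# name. For example, the phone-number component matches common North American
# formats:
#
#     >>> phone_re = re.compile(REGEXPS[1], re.VERBOSE)
#     >>> bool(phone_re.match('+1 (800) 555-1234'))
#     True
#     >>> bool(phone_re.match('800-555-1234'))
#     True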

######################################################################
# This is the core tokenizing regex:

WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE)
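
# Illustrative only (not part of the original module): WORD_RE implements
# step (3) of the module docstring; findall() returns one string per match,
# e.g.
#
#     >>> WORD_RE.findall("A #hashtag and a smiley :-)")
#     ['A', '#hashtag', 'and', 'a', 'smiley', ':-)']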

# WORD_RE performs poorly on these patterns:
HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')
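
# Illustrative only (not part of the original module): HANG_RE is used in
# TweetTokenizer.tokenize() to shorten runs of four or more identical
# non-alphanumeric characters to three, e.g.
#
#     >>> HANG_RE.sub(r'\1\1\1', 'Waiting........')
#     'Waiting...'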

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
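
# Illustrative only (not part of the original module): ENT_RE captures the
# '#'/'x' markers and the entity body separately, so named, decimal and
# hexadecimal references can be told apart in _convert_entity() below, e.g.
#
#     >>> ENT_RE.findall('&amp; &#38; &#x26;')
#     [('', '', 'amp'), ('#', '', '38'), ('#x', 'x', '26')]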

######################################################################
# Functions for converting html entities
######################################################################


def _str_to_unicode(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text

def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """
    Remove entities from text by converting them to their
    corresponding unicode character. Both numeric character references
    (``&#nnnn;`` and ``&#xhhhh;``) and named entities (such as ``&nbsp;``
    or ``&gt;``) are supported.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.

    :param bool remove_illegal: If `True`, entities that can't be converted are
        removed. Otherwise, entities that can't be converted are kept "as is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
        >>>
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return int2byte(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = html_entities.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
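
# Illustrative only (not part of the original module): entity names listed in
# `keep` are passed through untouched, while everything else is converted,
# e.g.
#
#     >>> _replace_html_entities(b'x &amp; y &gt; z', keep=['amp'])
#     'x &amp; y > z'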

######################################################################


class TweetTokenizer:
    r"""
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0)
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    Examples using the `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    """

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; if `preserve_case=False`,
            all tokens except emoticons are downcased
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words
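
# Illustrative only (not part of the original module): with preserve_case=False
# every token except emoticons is downcased, e.g.
#
#     >>> TweetTokenizer(preserve_case=False).tokenize('GREAT day :D')
#     ['great', 'day', ':D']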

######################################################################
# Normalization Functions
######################################################################


def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    """
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)
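
# Illustrative only (not part of the original module):
#
#     >>> reduce_lengthening('waaaaayyyy')
#     'waaayyy'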

def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = re.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
    )
    # Substitute handles with ' ' to ensure that text on either side of a removed handle is tokenized correctly
    return pattern.sub(' ', text)
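
# Illustrative only (not part of the original module): each handle is replaced
# by a single space so that neighbouring text still tokenizes cleanly, e.g.
#
#     >>> remove_handles('@remy: hello')
#     ' : hello'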

######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles
    ).tokenize(text)
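
# Illustrative only (not part of the original module): the convenience wrapper
# builds a fresh TweetTokenizer per call, e.g.
#
#     >>> casual_tokenize('@remy: This is waaaaayyyy too much!!',
#     ...                 reduce_len=True, strip_handles=True)
#     [':', 'This', 'is', 'waaayyy', 'too', 'much', '!', '!']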

###############################################################################