# coding: utf-8
#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
- """
- Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
- domains and tasks. The basic logic is this:
- 1. The tuple regex_strings defines a list of regular expression
- strings.
- 2. The regex_strings strings are put, in order, into a compiled
- regular expression object called word_re.
- 3. The tokenization is done by word_re.findall(s), where s is the
- user-supplied string, inside the tokenize() method of the class
- Tokenizer.
- 4. When instantiating Tokenizer objects, there is a single option:
- preserve_case. By default, it is set to True. If it is set to
- False, then the tokenizer will downcase everything except for
- emoticons.
- """

######################################################################

from __future__ import unicode_literals

import re

from six import int2byte, unichr
from six.moves import html_entities

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears early in the final regex (since it can contain whitespace;
# only the URL pattern precedes it). It could also matter that tags
# come after emoticons, due to the possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last-ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple of ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
        [<>]?
        [:;=8]                      # eyes
        [\-o\*\']?                  # optional nose
        [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
        |
        [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
        [\-o\*\']?                  # optional nose
        [:;=8]                      # eyes
        [<>]?
        |
        <3                          # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
URLS = r"""             # Capture 1: entire matched URL
  (?:
  https?:               # URL protocol and colon
    (?:
      /{1,3}            # 1-3 slashes
      |                 #   or
      [a-z0-9%]         # Single letter or digit or '%'
                        # (Trying not to match e.g. "URI::Escape")
    )
    |                   #   or
                        # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                   # One or more:
    [^\s()<>{}\[\]]+    # Run of non-space, non-()<>{}[]
    |                   #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)         # balanced parens, non-recursive: (...)
  )+
  (?:                   # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)         # balanced parens, non-recursive: (...)
    |                   #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]      # not a space or one of these punct chars
  )
  |                     # OR, the following to match naked domains:
  (?:
    (?<!@)              # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)               # not succeeded by a @,
                        # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
        (?:             # (international)
            \+?[01]
            [ *\-.\)]*
        )?
        (?:             # (area code)
            [\(]?
            \d{3}
            [ *\-.\)]*
        )?
        \d{3}           # exchange
        [ *\-.\)]*
        \d{4}           # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])  # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)             # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                                # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})                       # Ellipsis dots.
    |
    (?:\S)                                    # Everything else that isn't whitespace.
    """,
)
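
# For illustration, the final "remaining word types" entry should pick up
# contractions, number-like tokens, and ellipses, e.g.:
#
#     >>> re.findall(REGEXPS[-1], "can't 3.14 ...", re.VERBOSE | re.UNICODE)
#     ["can't", '3.14', '...']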

######################################################################

# This is the core tokenizing regex:
WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE)

# WORD_RE performs poorly on these patterns:
HANG_RE = re.compile(r'([^a-zA-Z0-9])\1{3,}')
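
# For illustration, HANG_RE (as used in tokenize() below) should shorten runs
# of four or more identical non-alphanumeric characters to three, e.g.:
#
#     >>> HANG_RE.sub(r'\1\1\1', "Wait.........what?!!!!!")
#     'Wait...what?!!!'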

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);')
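
# For illustration, ENT_RE should match named, decimal, and hexadecimal
# entity forms alike, e.g.:
#
#     >>> [m.group(0) for m in ENT_RE.finditer("&amp; &#36; &#x24;")]
#     ['&amp;', '&#36;', '&#x24;']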

######################################################################
# Functions for converting html entities
######################################################################


def _str_to_unicode(text, encoding=None, errors='strict'):
    if encoding is None:
        encoding = 'utf-8'
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').
    :param list keep: list of entity names which should not be replaced. This
        supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``) and
        named entities (such as ``&nbsp;`` or ``&gt;``).
    :param bool remove_illegal: If `True`, entities that can't be converted are
        removed. Otherwise, entities that can't be converted are kept "as is".
    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    >>> from nltk.tokenize.casual import _replace_html_entities
    >>> _replace_html_entities(b'Price: &pound;100')
    'Price: \\xa3100'
    >>> print(_replace_html_entities(b'Price: &pound;100'))
    Price: £100
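
    A numeric reference in the 0x80-0x9F range should be mapped through
    CP1252, e.g.:

    >>> print(_replace_html_entities(b'&#153;'))
    ™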
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return int2byte(number).decode('cp1252')
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = html_entities.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return unichr(number)
            except ValueError:
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################


class TweetTokenizer:
    r"""
    Tokenizer for tweets.

    >>> from nltk.tokenize import TweetTokenizer
    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    Examples using the `strip_handles` and `reduce_len` parameters:

    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    """

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; if `preserve_case=False`, all
            tokens except emoticons are downcased
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words
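
# For illustration, with ``preserve_case=False`` everything except emoticons
# should be downcased, e.g.:
#
#     >>> TweetTokenizer(preserve_case=False).tokenize("GREAT day :D")
#     ['great', 'day', ':D']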


######################################################################
# Normalization Functions
######################################################################


def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with sequences
    of length 3.
    """
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)
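
# For illustration:
#
#     >>> reduce_lengthening("waaaaayyyyyy")
#     'waaayyy'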


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = re.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
    )
    # Substitute handles with ' ' to ensure that text on either side of a
    # removed handle is tokenized correctly
    return pattern.sub(' ', text)
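
# For illustration, each handle is replaced by a single space:
#
#     >>> remove_handles("@remy: Hello!")
#     ' : Hello!'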


######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles
    ).tokenize(text)
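
# For illustration:
#
#     >>> casual_tokenize("@user Sooooo #excited!!! :-)", reduce_len=True, strip_handles=True)
#     ['Sooo', '#excited', '!', '!', '!', ':-)']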


###############################################################################