123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- # Multi-Word Expression tokenizer
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Rob Malouf <rmalouf@mail.sdsu.edu>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- Multi-Word Expression Tokenizer
- A ``MWETokenizer`` takes a string which has already been divided into tokens and
- retokenizes it, merging multi-word expressions into single tokens, using a lexicon
- of MWEs:
- >>> from nltk.tokenize import MWETokenizer
- >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
- >>> tokenizer.add_mwe(('in', 'spite', 'of'))
- >>> tokenizer.tokenize('Testing testing testing one two three'.split())
- ['Testing', 'testing', 'testing', 'one', 'two', 'three']
- >>> tokenizer.tokenize('This is a test in spite'.split())
- ['This', 'is', 'a', 'test', 'in', 'spite']
- >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
- ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
- """
- from nltk.util import Trie
- from nltk.tokenize.api import TokenizerI
class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word
    expressions (MWEs) into single tokens.

    The lexicon of MWEs is stored as a word trie (``nltk.util.Trie``); the
    key ``Trie.LEAF`` inside a node marks the end of a complete expression.
    """

    def __init__(self, mwes=None, separator='_'):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a
            multi-word expression token. (Default is '_')
        """
        # Treat None (and any other empty value) as "no expressions yet".
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True
        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """Merge every multi-word expression found in ``text`` into one token.

        At each position the longest expression from the lexicon that starts
        there is merged. If the walk through the trie runs past a complete
        expression into a longer one that the text does not finish, the
        tokenizer falls back to the longest complete expression it saw rather
        than discarding the match.

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit', 'more')])
        >>> tokenizer.tokenize('a little bit'.split())
        ['a_little', 'bit']
        """
        i = 0
        n = len(text)
        result = []
        while i < n:
            # Walk the trie as far as the text allows, starting at token i.
            # If text[i] does not start any MWE the loop body never runs.
            j = i
            trie = self._mwes
            last_match = -1  # end index (exclusive) of the longest full MWE
            while j < n and text[j] in trie:
                trie = trie[text[j]]
                j += 1
                if Trie.LEAF in trie:
                    # text[i:j] is a complete MWE; remember it in case the
                    # walk continues into a longer entry that never completes.
                    last_match = j

            if last_match > -1:
                # success: merge the longest complete expression
                result.append(self._separator.join(text[i:last_match]))
                i = last_match
            else:
                # no complete expression starts here; emit the token as-is
                result.append(text[i])
                i += 1
        return result
|