# Multi-Word Expression tokenizer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Multi-Word Expression Tokenizer

A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:

    >>> from nltk.tokenize import MWETokenizer

    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
    >>> tokenizer.add_mwe(('in', 'spite', 'of'))

    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
    ['Testing', 'testing', 'testing', 'one', 'two', 'three']

    >>> tokenizer.tokenize('This is a test in spite'.split())
    ['This', 'is', 'a', 'test', 'in', 'spite']

    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
"""

from nltk.util import Trie
from nltk.tokenize.api import TokenizerI


class MWETokenizer(TokenizerI):
    """A tokenizer that processes tokenized text and merges multi-word
    expressions into single tokens.
    """

    def __init__(self, mwes=None, separator='_'):
        """Initialize the multi-word tokenizer with a list of expressions and a
        separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a
            multi-word expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True

        """
        self._mwes.insert(mwe)
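
    # Illustrative note (an observation from the doctest above, not a
    # documented guarantee of ``nltk.util.Trie``): successive ``add_mwe``
    # calls share prefixes, so ('a', 'b') and ('a', 'b', 'c') hang off the
    # same 'a' -> 'b' path, and the key ``True`` (``Trie.LEAF``) marks each
    # point where a complete expression ends.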

    def tokenize(self, text):
        """

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match: walk the trie as far as the input
                # allows, remembering where the last complete MWE ended
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        last_match = j
                if last_match > -1:
                    # success! merge the longest MWE that actually completed.
                    # (Tracking ``last_match`` keeps a shorter MWE such as
                    # ('a', 'b') from being dropped when the walk overshoots
                    # toward a longer one, e.g. ('a', 'b', 'c', 'd'), that
                    # never completes.)
                    result.append(self._separator.join(text[i:last_match]))
                    i = last_match
                else:
                    # no match, so backtrack
                    result.append(text[i])
                    i += 1
            else:
                result.append(text[i])
                i += 1

        return result
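

# ----------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): exercises
# the tokenizer the same way the doctests above do, then runs those
# doctests as a quick smoke test. Assumes only this file plus the stdlib
# ``doctest`` module; the example MWEs here are arbitrary.
if __name__ == '__main__':
    import doctest

    tokenizer = MWETokenizer([('a', 'little'), ('a', 'lot')], separator='_')
    tokenizer.add_mwe(('in', 'spite', 'of'))
    print(tokenizer.tokenize('a little help in spite of everything'.split()))
    # ['a_little', 'help', 'in_spite_of', 'everything']

    doctest.testmod()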