# api.py
# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Tokenizer Interface
"""

from abc import ABCMeta, abstractmethod

from six import add_metaclass

from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
  15. @add_metaclass(ABCMeta)
  16. class TokenizerI(object):
  17. """
  18. A processing interface for tokenizing a string.
  19. Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
  20. """
  21. @abstractmethod
  22. def tokenize(self, s):
  23. """
  24. Return a tokenized copy of *s*.
  25. :rtype: list of str
  26. """
  27. if overridden(self.tokenize_sents):
  28. return self.tokenize_sents([s])[0]
  29. def span_tokenize(self, s):
  30. """
  31. Identify the tokens using integer offsets ``(start_i, end_i)``,
  32. where ``s[start_i:end_i]`` is the corresponding token.
  33. :rtype: iter(tuple(int, int))
  34. """
  35. raise NotImplementedError()
  36. def tokenize_sents(self, strings):
  37. """
  38. Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
  39. return [self.tokenize(s) for s in strings]
  40. :rtype: list(list(str))
  41. """
  42. return [self.tokenize(s) for s in strings]
  43. def span_tokenize_sents(self, strings):
  44. """
  45. Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
  46. return [self.span_tokenize(s) for s in strings]
  47. :rtype: iter(list(tuple(int, int)))
  48. """
  49. for s in strings:
  50. yield list(self.span_tokenize(s))
  51. class StringTokenizer(TokenizerI):
  52. """A tokenizer that divides a string into substrings by splitting
  53. on the specified string (defined in subclasses).
  54. """
  55. def tokenize(self, s):
  56. return s.split(self._string)
  57. def span_tokenize(self, s):
  58. for span in string_span_tokenize(s, self._string):
  59. yield span