regexp.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. # Natural Language Toolkit: Stemmers
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Trevor Cohn <tacohn@cs.mu.oz.au>
  5. # Edward Loper <edloper@gmail.com>
  6. # Steven Bird <stevenbird1@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. from __future__ import unicode_literals
  10. import re
  11. from nltk.stem.api import StemmerI
  12. from nltk.compat import python_2_unicode_compatible
  13. @python_2_unicode_compatible
  14. class RegexpStemmer(StemmerI):
  15. """
  16. A stemmer that uses regular expressions to identify morphological
  17. affixes. Any substrings that match the regular expressions will
  18. be removed.
  19. >>> from nltk.stem import RegexpStemmer
  20. >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4)
  21. >>> st.stem('cars')
  22. 'car'
  23. >>> st.stem('mass')
  24. 'mas'
  25. >>> st.stem('was')
  26. 'was'
  27. >>> st.stem('bee')
  28. 'bee'
  29. >>> st.stem('compute')
  30. 'comput'
  31. >>> st.stem('advisable')
  32. 'advis'
  33. :type regexp: str or regexp
  34. :param regexp: The regular expression that should be used to
  35. identify morphological affixes.
  36. :type min: int
  37. :param min: The minimum length of string to stem
  38. """
  39. def __init__(self, regexp, min=0):
  40. if not hasattr(regexp, 'pattern'):
  41. regexp = re.compile(regexp)
  42. self._regexp = regexp
  43. self._min = min
  44. def stem(self, word):
  45. if len(word) < self._min:
  46. return word
  47. else:
  48. return self._regexp.sub('', word)
  49. def __repr__(self):
  50. return '<RegexpStemmer: {!r}>'.format(self._regexp.pattern)