# cistem.py (7.0 KB)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: CISTEM Stemmer for German
# Copyright (C) 2001-2019 NLTK Project
# Author: Leonie Weissweiler <l.weissweiler@outlook.de>
# Algorithm: Leonie Weissweiler <l.weissweiler@outlook.de>
#            Alexander Fraser <fraser@cis.lmu.de>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
  9. from __future__ import unicode_literals
  10. import re
  11. from nltk.stem.api import StemmerI
  12. from nltk.compat import python_2_unicode_compatible
  13. @python_2_unicode_compatible
  14. class Cistem(StemmerI):
  15. """
  16. CISTEM Stemmer for German
  17. This is the official Python implementation of the CISTEM stemmer.
  18. It is based on the paper
  19. Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German
  20. Based on a Comparative Analysis of Publicly Available Stemmers.
  21. In Proceedings of the German Society for Computational Linguistics and Language
  22. Technology (GSCL)
  23. which can be read here:
  24. http://www.cis.lmu.de/~weissweiler/cistem/
  25. In the paper, we conducted an analysis of publicly available stemmers,
  26. developed two gold standards for German stemming and evaluated the stemmers
  27. based on the two gold standards. We then proposed the stemmer implemented here
  28. and show that it achieves slightly better f-measure than the other stemmers and
  29. is thrice as fast as the Snowball stemmer for German while being about as fast
  30. as most other stemmers.
  31. case_insensitive is a a boolean specifying if case-insensitive stemming
  32. should be used. Case insensitivity improves performance only if words in the
  33. text may be incorrectly upper case. For all-lowercase and correctly cased
  34. text, best performance is achieved by setting case_insensitive for false.
  35. :param case_insensitive: if True, the stemming is case insensitive. False by default.
  36. :type case_insensitive: bool
  37. """
  38. strip_ge = re.compile(r"^ge(.{4,})")
  39. repl_xx = re.compile(r"(.)\1")
  40. strip_emr = re.compile(r"e[mr]$")
  41. strip_nd = re.compile(r"nd$")
  42. strip_t = re.compile(r"t$")
  43. strip_esn = re.compile(r"[esn]$")
  44. repl_xx_back = re.compile(r"(.)\*")
  45. def __init__(self, case_insensitive=False):
  46. self._case_insensitive = case_insensitive
  47. @staticmethod
  48. def replace_to(word):
  49. word = word.replace("sch", "$")
  50. word = word.replace("ei", "%")
  51. word = word.replace("ie", "&")
  52. word = Cistem.repl_xx.sub(r"\1*", word)
  53. return word
  54. @staticmethod
  55. def replace_back(word):
  56. word = Cistem.repl_xx_back.sub(r"\1\1", word)
  57. word = word.replace("%", "ei")
  58. word = word.replace("&", "ie")
  59. word = word.replace("$", "sch")
  60. return word
  61. def stem(self, word):
  62. """
  63. This method takes the word to be stemmed and returns the stemmed word.
  64. :param word: the word that is to be stemmed
  65. :type word: unicode
  66. :return word: the stemmed word
  67. :rtype: unicode
  68. >>> from nltk.stem.cistem import Cistem
  69. >>> stemmer = Cistem()
  70. >>> s1 = "Speicherbehältern"
  71. >>> stemmer.stem(s1)
  72. 'speicherbehalt'
  73. >>> s2 = "Grenzpostens"
  74. >>> stemmer.stem(s2)
  75. 'grenzpost'
  76. >>> s3 = "Ausgefeiltere"
  77. >>> stemmer.stem(s3)
  78. 'ausgefeilt'
  79. >>> stemmer = Cistem(True)
  80. >>> stemmer.stem(s1)
  81. 'speicherbehal'
  82. >>> stemmer.stem(s2)
  83. 'grenzpo'
  84. >>> stemmer.stem(s3)
  85. 'ausgefeil'
  86. """
  87. if len(word) == 0:
  88. return word
  89. upper = word[0].isupper()
  90. word = word.lower()
  91. word = word.replace("ü", "u")
  92. word = word.replace("ö", "o")
  93. word = word.replace("ä", "a")
  94. word = word.replace("ß", "ss")
  95. word = Cistem.strip_ge.sub(r"\1", word)
  96. word = Cistem.replace_to(word)
  97. while len(word) > 3:
  98. if len(word) > 5:
  99. (word, success) = Cistem.strip_emr.subn("", word)
  100. if success != 0:
  101. continue
  102. (word, success) = Cistem.strip_nd.subn("", word)
  103. if success != 0:
  104. continue
  105. if not upper or self._case_insensitive:
  106. (word, success) = Cistem.strip_t.subn("", word)
  107. if success != 0:
  108. continue
  109. (word, success) = Cistem.strip_esn.subn("", word)
  110. if success != 0:
  111. continue
  112. else:
  113. break
  114. word = Cistem.replace_back(word)
  115. return word
  116. def segment(self, word):
  117. """
  118. This method works very similarly to stem (:func:'cistem.stem'). The difference is that in
  119. addition to returning the stem, it also returns the rest that was removed at
  120. the end. To be able to return the stem unchanged so the stem and the rest
  121. can be concatenated to form the original word, all subsitutions that altered
  122. the stem in any other way than by removing letters at the end were left out.
  123. :param word: the word that is to be stemmed
  124. :type word: unicode
  125. :return word: the stemmed word
  126. :rtype: unicode
  127. :return word: the removed suffix
  128. :rtype: unicode
  129. >>> from nltk.stem.cistem import Cistem
  130. >>> stemmer = Cistem()
  131. >>> s1 = "Speicherbehältern"
  132. >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')")
  133. ('speicherbehält', 'ern')
  134. >>> s2 = "Grenzpostens"
  135. >>> stemmer.segment(s2)
  136. ('grenzpost', 'ens')
  137. >>> s3 = "Ausgefeiltere"
  138. >>> stemmer.segment(s3)
  139. ('ausgefeilt', 'ere')
  140. >>> stemmer = Cistem(True)
  141. >>> print("('" + stemmer.segment(s1)[0] + "', '" + stemmer.segment(s1)[1] + "')")
  142. ('speicherbehäl', 'tern')
  143. >>> stemmer.segment(s2)
  144. ('grenzpo', 'stens')
  145. >>> stemmer.segment(s3)
  146. ('ausgefeil', 'tere')
  147. """
  148. rest_length = 0
  149. if len(word) == 0:
  150. return ("", "")
  151. upper = word[0].isupper()
  152. word = word.lower()
  153. original = word[:]
  154. word = Cistem.replace_to(word)
  155. while len(word) > 3:
  156. if len(word) > 5:
  157. (word, success) = Cistem.strip_emr.subn("", word)
  158. if success != 0:
  159. rest_length += 2
  160. continue
  161. (word, success) = Cistem.strip_nd.subn("", word)
  162. if success != 0:
  163. rest_length += 2
  164. continue
  165. if not upper or self._case_insensitive:
  166. (word, success) = Cistem.strip_t.subn("", word)
  167. if success != 0:
  168. rest_length += 1
  169. continue
  170. (word, success) = Cistem.strip_esn.subn("", word)
  171. if success != 0:
  172. rest_length += 1
  173. continue
  174. else:
  175. break
  176. word = Cistem.replace_back(word)
  177. if rest_length:
  178. rest = original[-rest_length:]
  179. else:
  180. rest = ""
  181. return (word, rest)