lancaster.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from __future__ import unicode_literals

import re

from nltk.compat import python_2_unicode_compatible
from nltk.stem.api import StemmerI
  15. @python_2_unicode_compatible
  16. class LancasterStemmer(StemmerI):
  17. """
  18. Lancaster Stemmer
  19. >>> from nltk.stem.lancaster import LancasterStemmer
  20. >>> st = LancasterStemmer()
  21. >>> st.stem('maximum') # Remove "-um" when word is intact
  22. 'maxim'
  23. >>> st.stem('presumably') # Don't remove "-um" when word is not intact
  24. 'presum'
  25. >>> st.stem('multiply') # No action taken if word ends with "-ply"
  26. 'multiply'
  27. >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules
  28. 'provid'
  29. >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters
  30. 'ow'
  31. >>> st.stem('ear') # ditto
  32. 'ear'
  33. >>> st.stem('saying') # Words starting with consonant must contain at least 3
  34. 'say'
  35. >>> st.stem('crying') # letters and one of those letters must be a vowel
  36. 'cry'
  37. >>> st.stem('string') # ditto
  38. 'string'
  39. >>> st.stem('meant') # ditto
  40. 'meant'
  41. >>> st.stem('cement') # ditto
  42. 'cem'
  43. >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
  44. >>> st_pre.stem('kilometer') # Test Prefix
  45. 'met'
  46. >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
  47. >>> st_custom.stem("ness") # Change s to t
  48. 'nest'
  49. """
  50. # The rule list is static since it doesn't change between instances
  51. default_rule_tuple = (
  52. "ai*2.", # -ia > - if intact
  53. "a*1.", # -a > - if intact
  54. "bb1.", # -bb > -b
  55. "city3s.", # -ytic > -ys
  56. "ci2>", # -ic > -
  57. "cn1t>", # -nc > -nt
  58. "dd1.", # -dd > -d
  59. "dei3y>", # -ied > -y
  60. "deec2ss.", # -ceed >", -cess
  61. "dee1.", # -eed > -ee
  62. "de2>", # -ed > -
  63. "dooh4>", # -hood > -
  64. "e1>", # -e > -
  65. "feil1v.", # -lief > -liev
  66. "fi2>", # -if > -
  67. "gni3>", # -ing > -
  68. "gai3y.", # -iag > -y
  69. "ga2>", # -ag > -
  70. "gg1.", # -gg > -g
  71. "ht*2.", # -th > - if intact
  72. "hsiug5ct.", # -guish > -ct
  73. "hsi3>", # -ish > -
  74. "i*1.", # -i > - if intact
  75. "i1y>", # -i > -y
  76. "ji1d.", # -ij > -id -- see nois4j> & vis3j>
  77. "juf1s.", # -fuj > -fus
  78. "ju1d.", # -uj > -ud
  79. "jo1d.", # -oj > -od
  80. "jeh1r.", # -hej > -her
  81. "jrev1t.", # -verj > -vert
  82. "jsim2t.", # -misj > -mit
  83. "jn1d.", # -nj > -nd
  84. "j1s.", # -j > -s
  85. "lbaifi6.", # -ifiabl > -
  86. "lbai4y.", # -iabl > -y
  87. "lba3>", # -abl > -
  88. "lbi3.", # -ibl > -
  89. "lib2l>", # -bil > -bl
  90. "lc1.", # -cl > c
  91. "lufi4y.", # -iful > -y
  92. "luf3>", # -ful > -
  93. "lu2.", # -ul > -
  94. "lai3>", # -ial > -
  95. "lau3>", # -ual > -
  96. "la2>", # -al > -
  97. "ll1.", # -ll > -l
  98. "mui3.", # -ium > -
  99. "mu*2.", # -um > - if intact
  100. "msi3>", # -ism > -
  101. "mm1.", # -mm > -m
  102. "nois4j>", # -sion > -j
  103. "noix4ct.", # -xion > -ct
  104. "noi3>", # -ion > -
  105. "nai3>", # -ian > -
  106. "na2>", # -an > -
  107. "nee0.", # protect -een
  108. "ne2>", # -en > -
  109. "nn1.", # -nn > -n
  110. "pihs4>", # -ship > -
  111. "pp1.", # -pp > -p
  112. "re2>", # -er > -
  113. "rae0.", # protect -ear
  114. "ra2.", # -ar > -
  115. "ro2>", # -or > -
  116. "ru2>", # -ur > -
  117. "rr1.", # -rr > -r
  118. "rt1>", # -tr > -t
  119. "rei3y>", # -ier > -y
  120. "sei3y>", # -ies > -y
  121. "sis2.", # -sis > -s
  122. "si2>", # -is > -
  123. "ssen4>", # -ness > -
  124. "ss0.", # protect -ss
  125. "suo3>", # -ous > -
  126. "su*2.", # -us > - if intact
  127. "s*1>", # -s > - if intact
  128. "s0.", # -s > -s
  129. "tacilp4y.", # -plicat > -ply
  130. "ta2>", # -at > -
  131. "tnem4>", # -ment > -
  132. "tne3>", # -ent > -
  133. "tna3>", # -ant > -
  134. "tpir2b.", # -ript > -rib
  135. "tpro2b.", # -orpt > -orb
  136. "tcud1.", # -duct > -duc
  137. "tpmus2.", # -sumpt > -sum
  138. "tpec2iv.", # -cept > -ceiv
  139. "tulo2v.", # -olut > -olv
  140. "tsis0.", # protect -sist
  141. "tsi3>", # -ist > -
  142. "tt1.", # -tt > -t
  143. "uqi3.", # -iqu > -
  144. "ugo1.", # -ogu > -og
  145. "vis3j>", # -siv > -j
  146. "vie0.", # protect -eiv
  147. "vi2>", # -iv > -
  148. "ylb1>", # -bly > -bl
  149. "yli3y>", # -ily > -y
  150. "ylp0.", # protect -ply
  151. "yl2>", # -ly > -
  152. "ygo1.", # -ogy > -og
  153. "yhp1.", # -phy > -ph
  154. "ymo1.", # -omy > -om
  155. "ypo1.", # -opy > -op
  156. "yti3>", # -ity > -
  157. "yte3>", # -ety > -
  158. "ytl2.", # -lty > -l
  159. "yrtsi5.", # -istry > -
  160. "yra3>", # -ary > -
  161. "yro3>", # -ory > -
  162. "yfi3.", # -ify > -
  163. "ycn2t>", # -ncy > -nt
  164. "yca3>", # -acy > -
  165. "zi2>", # -iz > -
  166. "zy1s.", # -yz > -ys
  167. )
  168. def __init__(self, rule_tuple=None, strip_prefix_flag=False):
  169. """Create an instance of the Lancaster stemmer.
  170. """
  171. # Setup an empty rule dictionary - this will be filled in later
  172. self.rule_dictionary = {}
  173. # Check if a user wants to strip prefix
  174. self._strip_prefix = strip_prefix_flag
  175. # Check if a user wants to use his/her own rule tuples.
  176. self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple
  177. def parseRules(self, rule_tuple=None):
  178. """Validate the set of rules used in this stemmer.
  179. If this function is called as an individual method, without using stem
  180. method, rule_tuple argument will be compiled into self.rule_dictionary.
  181. If this function is called within stem, self._rule_tuple will be used.
  182. """
  183. # If there is no argument for the function, use class' own rule tuple.
  184. rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
  185. valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$")
  186. # Empty any old rules from the rule set before adding new ones
  187. self.rule_dictionary = {}
  188. for rule in rule_tuple:
  189. if not valid_rule.match(rule):
  190. raise ValueError("The rule {0} is invalid".format(rule))
  191. first_letter = rule[0:1]
  192. if first_letter in self.rule_dictionary:
  193. self.rule_dictionary[first_letter].append(rule)
  194. else:
  195. self.rule_dictionary[first_letter] = [rule]
  196. def stem(self, word):
  197. """Stem a word using the Lancaster stemmer.
  198. """
  199. # Lower-case the word, since all the rules are lower-cased
  200. word = word.lower()
  201. word = self.__stripPrefix(word) if self._strip_prefix else word
  202. # Save a copy of the original word
  203. intact_word = word
  204. # If rule dictionary is empty, parse rule tuple.
  205. if not self.rule_dictionary:
  206. self.parseRules()
  207. return self.__doStemming(word, intact_word)
  208. def __doStemming(self, word, intact_word):
  209. """Perform the actual word stemming
  210. """
  211. valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")
  212. proceed = True
  213. while proceed:
  214. # Find the position of the last letter of the word to be stemmed
  215. last_letter_position = self.__getLastLetter(word)
  216. # Only stem the word if it has a last letter and a rule matching that last letter
  217. if (
  218. last_letter_position < 0
  219. or word[last_letter_position] not in self.rule_dictionary
  220. ):
  221. proceed = False
  222. else:
  223. rule_was_applied = False
  224. # Go through each rule that matches the word's final letter
  225. for rule in self.rule_dictionary[word[last_letter_position]]:
  226. rule_match = valid_rule.match(rule)
  227. if rule_match:
  228. (
  229. ending_string,
  230. intact_flag,
  231. remove_total,
  232. append_string,
  233. cont_flag,
  234. ) = rule_match.groups()
  235. # Convert the number of chars to remove when stemming
  236. # from a string to an integer
  237. remove_total = int(remove_total)
  238. # Proceed if word's ending matches rule's word ending
  239. if word.endswith(ending_string[::-1]):
  240. if intact_flag:
  241. if word == intact_word and self.__isAcceptable(
  242. word, remove_total
  243. ):
  244. word = self.__applyRule(
  245. word, remove_total, append_string
  246. )
  247. rule_was_applied = True
  248. if cont_flag == '.':
  249. proceed = False
  250. break
  251. elif self.__isAcceptable(word, remove_total):
  252. word = self.__applyRule(
  253. word, remove_total, append_string
  254. )
  255. rule_was_applied = True
  256. if cont_flag == '.':
  257. proceed = False
  258. break
  259. # If no rules apply, the word doesn't need any more stemming
  260. if rule_was_applied == False:
  261. proceed = False
  262. return word
  263. def __getLastLetter(self, word):
  264. """Get the zero-based index of the last alphabetic character in this string
  265. """
  266. last_letter = -1
  267. for position in range(len(word)):
  268. if word[position].isalpha():
  269. last_letter = position
  270. else:
  271. break
  272. return last_letter
  273. def __isAcceptable(self, word, remove_total):
  274. """Determine if the word is acceptable for stemming.
  275. """
  276. word_is_acceptable = False
  277. # If the word starts with a vowel, it must be at least 2
  278. # characters long to be stemmed
  279. if word[0] in "aeiouy":
  280. if len(word) - remove_total >= 2:
  281. word_is_acceptable = True
  282. # If the word starts with a consonant, it must be at least 3
  283. # characters long (including one vowel) to be stemmed
  284. elif len(word) - remove_total >= 3:
  285. if word[1] in "aeiouy":
  286. word_is_acceptable = True
  287. elif word[2] in "aeiouy":
  288. word_is_acceptable = True
  289. return word_is_acceptable
  290. def __applyRule(self, word, remove_total, append_string):
  291. """Apply the stemming rule to the word
  292. """
  293. # Remove letters from the end of the word
  294. new_word_length = len(word) - remove_total
  295. word = word[0:new_word_length]
  296. # And add new letters to the end of the truncated word
  297. if append_string:
  298. word += append_string
  299. return word
  300. def __stripPrefix(self, word):
  301. """Remove prefix from a word.
  302. This function originally taken from Whoosh.
  303. """
  304. for prefix in (
  305. "kilo",
  306. "micro",
  307. "milli",
  308. "intra",
  309. "ultra",
  310. "mega",
  311. "nano",
  312. "pico",
  313. "pseudo",
  314. ):
  315. if word.startswith(prefix):
  316. return word[len(prefix) :]
  317. return word
  318. def __repr__(self):
  319. return '<LancasterStemmer>'