123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- # Natural Language Toolkit: Stemmers
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Steven Tomcavage <stomcava@law.upenn.edu>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
- Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
- """
- from __future__ import unicode_literals
- import re
- from nltk.stem.api import StemmerI
- from nltk.compat import python_2_unicode_compatible
@python_2_unicode_compatible
class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

    Each stemming rule is a string made of, in order:

    - the word ending to match, written in reverse (``"gni"`` matches "-ing");
    - an optional ``*``: only apply the rule while the word is still intact,
      i.e. no other rule has modified it yet;
    - a single digit: the number of trailing characters to remove;
    - an optional string of letters to append after the removal;
    - an optional ``>`` (keep stemming the result) or ``.`` (stop).

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
        >>> st_pre.stem('kilometer') # Test Prefix
        'met'
        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
        >>> st_custom.stem("ness") # Change s to t
        'nest'
    """

    # The rule list is static since it doesn't change between instances
    default_rule_tuple = (
        "ai*2.",  # -ia > -   if intact
        "a*1.",  # -a > -    if intact
        "bb1.",  # -bb > -b
        "city3s.",  # -ytic > -ys
        "ci2>",  # -ic > -
        "cn1t>",  # -nc > -nt
        "dd1.",  # -dd > -d
        "dei3y>",  # -ied > -y
        "deec2ss.",  # -ceed > -cess
        "dee1.",  # -eed > -ee
        "de2>",  # -ed > -
        "dooh4>",  # -hood > -
        "e1>",  # -e > -
        "feil1v.",  # -lief > -liev
        "fi2>",  # -if > -
        "gni3>",  # -ing > -
        "gai3y.",  # -iag > -y
        "ga2>",  # -ag > -
        "gg1.",  # -gg > -g
        "ht*2.",  # -th > -   if intact
        "hsiug5ct.",  # -guish > -ct
        "hsi3>",  # -ish > -
        "i*1.",  # -i > -    if intact
        "i1y>",  # -i > -y
        "ji1d.",  # -ij > -id   --  see nois4j> & vis3j>
        "juf1s.",  # -fuj > -fus
        "ju1d.",  # -uj > -ud
        "jo1d.",  # -oj > -od
        "jeh1r.",  # -hej > -her
        "jrev1t.",  # -verj > -vert
        "jsim2t.",  # -misj > -mit
        "jn1d.",  # -nj > -nd
        "j1s.",  # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",  # -iabl > -y
        "lba3>",  # -abl > -
        "lbi3.",  # -ibl > -
        "lib2l>",  # -bil > -bl
        "lc1.",  # -cl > -c
        "lufi4y.",  # -iful > -y
        "luf3>",  # -ful > -
        "lu2.",  # -ul > -
        "lai3>",  # -ial > -
        "lau3>",  # -ual > -
        "la2>",  # -al > -
        "ll1.",  # -ll > -l
        "mui3.",  # -ium > -
        "mu*2.",  # -um > -   if intact
        "msi3>",  # -ism > -
        "mm1.",  # -mm > -m
        "nois4j>",  # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",  # -ion > -
        "nai3>",  # -ian > -
        "na2>",  # -an > -
        "nee0.",  # protect  -een
        "ne2>",  # -en > -
        "nn1.",  # -nn > -n
        "pihs4>",  # -ship > -
        "pp1.",  # -pp > -p
        "re2>",  # -er > -
        "rae0.",  # protect  -ear
        "ra2.",  # -ar > -
        "ro2>",  # -or > -
        "ru2>",  # -ur > -
        "rr1.",  # -rr > -r
        "rt1>",  # -tr > -t
        "rei3y>",  # -ier > -y
        "sei3y>",  # -ies > -y
        "sis2.",  # -sis > -s
        "si2>",  # -is > -
        "ssen4>",  # -ness > -
        "ss0.",  # protect  -ss
        "suo3>",  # -ous > -
        "su*2.",  # -us > -   if intact
        "s*1>",  # -s > -    if intact
        "s0.",  # -s > -s
        "tacilp4y.",  # -plicat > -ply
        "ta2>",  # -at > -
        "tnem4>",  # -ment > -
        "tne3>",  # -ent > -
        "tna3>",  # -ant > -
        "tpir2b.",  # -ript > -rib
        "tpro2b.",  # -orpt > -orb
        "tcud1.",  # -duct > -duc
        "tpmus2.",  # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",  # -olut > -olv
        "tsis0.",  # protect  -sist
        "tsi3>",  # -ist > -
        "tt1.",  # -tt > -t
        "uqi3.",  # -iqu > -
        "ugo1.",  # -ogu > -og
        "vis3j>",  # -siv > -j
        "vie0.",  # protect  -eiv
        "vi2>",  # -iv > -
        "ylb1>",  # -bly > -bl
        "yli3y>",  # -ily > -y
        "ylp0.",  # protect  -ply
        "yl2>",  # -ly > -
        "ygo1.",  # -ogy > -og
        "yhp1.",  # -phy > -ph
        "ymo1.",  # -omy > -om
        "ypo1.",  # -opy > -op
        "yti3>",  # -ity > -
        "yte3>",  # -ety > -
        "ytl2.",  # -lty > -l
        "yrtsi5.",  # -istry > -
        "yra3>",  # -ary > -
        "yro3>",  # -ory > -
        "yfi3.",  # -ify > -
        "ycn2t>",  # -ncy > -nt
        "yca3>",  # -acy > -
        "zi2>",  # -iz > -
        "zy1s.",  # -yz > -ys
    )

    # Compiled once and shared by rule validation and stemming.  Groups:
    # (reversed ending)(intact flag)(removal count)(append string)(continuation)
    _rule_pattern = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
        """Create an instance of the Lancaster stemmer.

        :param rule_tuple: rules to use instead of ``default_rule_tuple``;
            a falsy value (``None``, empty tuple) selects the defaults.
        :param strip_prefix_flag: when true, ``stem`` removes a known prefix
            (e.g. "kilo", "micro") from the word before stemming it.
        """
        # Setup an empty rule dictionary - it is filled lazily by stem()
        self.rule_dictionary = {}
        # Check if a user wants to strip prefix
        self._strip_prefix = strip_prefix_flag
        # Check if a user wants to use his/her own rule tuples.
        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple

    def parseRules(self, rule_tuple=None):
        """Validate a set of rules and index them in ``self.rule_dictionary``.

        The dictionary maps the first character of each rule (that is, the
        last letter of the word ending it matches) to the list of rules
        starting with that character, in their original order.

        :param rule_tuple: rules to compile; when omitted or empty, the rule
            tuple chosen at construction time is used.
        :raises ValueError: if a rule does not follow the rule format.  In
            that case ``self.rule_dictionary`` is left as it was before the
            call, so a partially parsed rule set is never used for stemming.
        """
        # If there is no argument for the function, use class' own rule tuple.
        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple

        # Build into a local dictionary and publish it only once every rule
        # has been validated.
        rule_dictionary = {}
        for rule in rule_tuple:
            if not self._rule_pattern.match(rule):
                raise ValueError("The rule {0} is invalid".format(rule))
            rule_dictionary.setdefault(rule[0], []).append(rule)

        self.rule_dictionary = rule_dictionary

    def stem(self, word):
        """Stem a word using the Lancaster stemmer.

        :param word: the word to stem; it is lower-cased first.
        :return: the stemmed, lower-cased word.
        :raises ValueError: if the rules have not been parsed yet and the
            rule tuple contains an invalid rule.
        """
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()
        word = self.__stripPrefix(word) if self._strip_prefix else word

        # Save a copy of the original word
        intact_word = word

        # If rule dictionary is empty, parse rule tuple.
        if not self.rule_dictionary:
            self.parseRules()

        return self.__doStemming(word, intact_word)

    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming.

        Repeatedly applies the first applicable rule for the word's last
        letter until no rule applies or an applied rule ends with ``.``.

        :param word: the word to stem.
        :param intact_word: the unmodified word, used by "intact" rules.
        :return: the stemmed word.
        """
        proceed = True

        while proceed:
            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule matching
            # that last letter
            if (
                last_letter_position < 0
                or word[last_letter_position] not in self.rule_dictionary
            ):
                break

            rule_was_applied = False

            # Go through each rule that matches the word's final letter
            for rule in self.rule_dictionary[word[last_letter_position]]:
                rule_match = self._rule_pattern.match(rule)
                if not rule_match:
                    continue
                (
                    ending_string,
                    intact_flag,
                    remove_total,
                    append_string,
                    cont_flag,
                ) = rule_match.groups()

                # Rule endings are stored reversed; the comparison is made
                # against the end of the whole word.
                if not word.endswith(ending_string[::-1]):
                    continue

                # An "intact" rule only fires on a still-unmodified word
                if intact_flag and word != intact_word:
                    continue

                # Convert the number of chars to remove when stemming
                # from a string to an integer
                remove_total = int(remove_total)
                if not self.__isAcceptable(word, remove_total):
                    continue

                word = self.__applyRule(word, remove_total, append_string)
                rule_was_applied = True
                if cont_flag == ".":
                    proceed = False
                # Restart rule lookup with the new last letter
                break

            # If no rules apply, the word doesn't need any more stemming
            if not rule_was_applied:
                proceed = False

        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last character of the word's
        leading run of alphabetic characters.

        Scanning stops at the first non-alphabetic character, so "abc1d"
        yields 2.  Returns -1 when the word is empty or does not start with
        an alphabetic character.
        """
        last_letter = -1
        for position, character in enumerate(word):
            if not character.isalpha():
                break
            last_letter = position
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming.

        :param word: the non-empty word about to be stemmed.
        :param remove_total: number of trailing characters the rule removes.
        :return: True if removing that many characters leaves an acceptable
            stem, False otherwise.
        """
        vowels = "aeiouy"
        stem_length = len(word) - remove_total

        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in vowels:
            return stem_length >= 2

        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed
        return stem_length >= 3 and (word[1] in vowels or word[2] in vowels)

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word.

        :param word: the word to modify.
        :param remove_total: number of trailing characters to remove.
        :param append_string: letters to append afterwards (may be empty).
        :return: the modified word.
        """
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __stripPrefix(self, word):
        """Remove prefix from a word.

        Only the first matching prefix, in the order listed below, is
        removed; a word without any of these prefixes is returned unchanged.

        This function originally taken from Whoosh.
        """
        for prefix in (
            "kilo",
            "micro",
            "milli",
            "intra",
            "ultra",
            "mega",
            "nano",
            "pico",
            "pseudo",
        ):
            if word.startswith(prefix):
                return word[len(prefix) :]
        return word

    def __repr__(self):
        return '<LancasterStemmer>'
|