rslp.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
# -*- coding: utf-8 -*-
# Natural Language Toolkit: RSLP Stemmer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Tiago Tresoldi <tresoldi@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
# This code is based on the algorithm presented in the paper "A Stemming
# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and
# Christian Huyck, which unfortunately I had no access to. The code is a
# Python version, with some minor modifications of mine, to the description
# presented at http://www.webcitation.org/5NnvdIzOb and to the C source code
# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html.
# Please note that this stemmer is intended for demonstration and educational
# purposes only. Feel free to write me for any comments, including the
# development of a different and/or better stemmer for Portuguese. I also
# suggest using NLTK's mailing list for Portuguese for any discussion.
#
# Este código é baseado no algoritmo apresentado no artigo "A Stemming
# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e
# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O
# código é uma conversão para Python, com algumas pequenas modificações
# minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do
# código para linguagem C disponível em
# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor,
# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente
# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer
# comentário, inclusive sobre o desenvolvimento de um stemmer diferente
# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão
# do NLTK para o português para qualquer debate.
  30. from __future__ import print_function, unicode_literals
  31. from nltk.data import load
  32. from nltk.stem.api import StemmerI
  33. class RSLPStemmer(StemmerI):
  34. """
  35. A stemmer for Portuguese.
  36. >>> from nltk.stem import RSLPStemmer
  37. >>> st = RSLPStemmer()
  38. >>> # opening lines of Erico Verissimo's "Música ao Longe"
  39. >>> text = '''
  40. ... Clarissa risca com giz no quadro-negro a paisagem que os alunos
  41. ... devem copiar . Uma casinha de porta e janela , em cima duma
  42. ... coxilha .'''
  43. >>> for token in text.split():
  44. ... print(st.stem(token))
  45. clariss risc com giz no quadro-negr a pais que os alun dev copi .
  46. uma cas de port e janel , em cim dum coxilh .
  47. """
  48. def __init__(self):
  49. self._model = []
  50. self._model.append(self.read_rule("step0.pt"))
  51. self._model.append(self.read_rule("step1.pt"))
  52. self._model.append(self.read_rule("step2.pt"))
  53. self._model.append(self.read_rule("step3.pt"))
  54. self._model.append(self.read_rule("step4.pt"))
  55. self._model.append(self.read_rule("step5.pt"))
  56. self._model.append(self.read_rule("step6.pt"))
  57. def read_rule(self, filename):
  58. rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
  59. lines = rules.split("\n")
  60. lines = [line for line in lines if line != ""] # remove blank lines
  61. lines = [line for line in lines if line[0] != "#"] # remove comments
  62. # NOTE: a simple but ugly hack to make this parser happy with double '\t's
  63. lines = [line.replace("\t\t", "\t") for line in lines]
  64. # parse rules
  65. rules = []
  66. for line in lines:
  67. rule = []
  68. tokens = line.split("\t")
  69. # text to be searched for at the end of the string
  70. rule.append(tokens[0][1:-1]) # remove quotes
  71. # minimum stem size to perform the replacement
  72. rule.append(int(tokens[1]))
  73. # text to be replaced into
  74. rule.append(tokens[2][1:-1]) # remove quotes
  75. # exceptions to this rule
  76. rule.append([token[1:-1] for token in tokens[3].split(",")])
  77. # append to the results
  78. rules.append(rule)
  79. return rules
  80. def stem(self, word):
  81. word = word.lower()
  82. # the word ends in 's'? apply rule for plural reduction
  83. if word[-1] == "s":
  84. word = self.apply_rule(word, 0)
  85. # the word ends in 'a'? apply rule for feminine reduction
  86. if word[-1] == "a":
  87. word = self.apply_rule(word, 1)
  88. # augmentative reduction
  89. word = self.apply_rule(word, 3)
  90. # adverb reduction
  91. word = self.apply_rule(word, 2)
  92. # noun reduction
  93. prev_word = word
  94. word = self.apply_rule(word, 4)
  95. if word == prev_word:
  96. # verb reduction
  97. prev_word = word
  98. word = self.apply_rule(word, 5)
  99. if word == prev_word:
  100. # vowel removal
  101. word = self.apply_rule(word, 6)
  102. return word
  103. def apply_rule(self, word, rule_index):
  104. rules = self._model[rule_index]
  105. for rule in rules:
  106. suffix_length = len(rule[0])
  107. if word[-suffix_length:] == rule[0]: # if suffix matches
  108. if len(word) >= suffix_length + rule[1]: # if we have minimum size
  109. if word not in rule[3]: # if not an exception
  110. word = word[:-suffix_length] + rule[2]
  111. break
  112. return word