arlstem.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Natural Language Toolkit: ARLSTem Stemmer
  4. #
  5. # Copyright (C) 2001-2019 NLTK Project
  6. #
  7. # Author: Kheireddine Abainia (x-programer) <k.abainia@gmail.com>
  8. # Algorithms: Kheireddine Abainia <k.abainia@gmail.com>
  9. # Siham Ouamour
  10. # Halim Sayoud
  11. # URL: <http://nltk.org/>
  12. # For license information, see LICENSE.TXT
  13. """
  14. ARLSTem Arabic Stemmer
  15. The details about the implementation of this algorithm are described in:
  16. K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer,
  17. Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17),
  18. Vol. 29, No. 3, 2017, pp. 557-573.
  19. ARLSTem is a light Arabic stemmer that is based on removing the affixes
  20. from the word (i.e. prefixes, suffixes and infixes). It was evaluated and
  21. compared to several other stemmers using Paice's parameters (under-stemming
  22. index, over-stemming index and stemming weight), and the results showed that
  23. ARLSTem is promising and achieves high performance. This stemmer is not
  24. based on any dictionary and can be used on-line effectively.
  25. """
  26. from __future__ import unicode_literals
  27. import re
  28. from nltk.stem.api import StemmerI
  29. class ARLSTem(StemmerI):
  30. '''
  31. ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary.
  32. Department of Telecommunication & Information Processing. USTHB University,
  33. Algiers, Algeria.
  34. ARLSTem.stem(token) returns the Arabic stem for the input token.
  35. The ARLSTem Stemmer requires that all tokens are encoded using Unicode
  36. encoding.
  37. '''
  38. def __init__(self):
  39. # different Alif with hamza
  40. self.re_hamzated_alif = re.compile(r'[\u0622\u0623\u0625]')
  41. self.re_alifMaqsura = re.compile(r'[\u0649]')
  42. self.re_diacritics = re.compile(r'[\u064B-\u065F]')
  43. # Alif Laam, Laam Laam, Fa Laam, Fa Ba
  44. self.pr2 = ['\u0627\u0644', '\u0644\u0644', '\u0641\u0644', '\u0641\u0628']
  45. # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam
  46. self.pr3 = ['\u0628\u0627\u0644', '\u0643\u0627\u0644', '\u0648\u0627\u0644']
  47. # Fa Laam Laam, Waaw Laam Laam
  48. self.pr32 = ['\u0641\u0644\u0644', '\u0648\u0644\u0644']
  49. # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam
  50. self.pr4 = [
  51. '\u0641\u0628\u0627\u0644',
  52. '\u0648\u0628\u0627\u0644',
  53. '\u0641\u0643\u0627\u0644',
  54. ]
  55. # Kaf Yaa, Kaf Miim
  56. self.su2 = ['\u0643\u064A', '\u0643\u0645']
  57. # Ha Alif, Ha Miim
  58. self.su22 = ['\u0647\u0627', '\u0647\u0645']
  59. # Kaf Miim Alif, Kaf Noon Shadda
  60. self.su3 = ['\u0643\u0645\u0627', '\u0643\u0646\u0651']
  61. # Ha Miim Alif, Ha Noon Shadda
  62. self.su32 = ['\u0647\u0645\u0627', '\u0647\u0646\u0651']
  63. # Alif Noon, Ya Noon, Waaw Noon
  64. self.pl_si2 = ['\u0627\u0646', '\u064A\u0646', '\u0648\u0646']
  65. # Taa Alif Noon, Taa Ya Noon
  66. self.pl_si3 = ['\u062A\u0627\u0646', '\u062A\u064A\u0646']
  67. # Alif Noon, Waaw Noon
  68. self.verb_su2 = ['\u0627\u0646', '\u0648\u0646']
  69. # Siin Taa, Siin Yaa
  70. self.verb_pr2 = ['\u0633\u062A', '\u0633\u064A']
  71. # Siin Alif, Siin Noon
  72. self.verb_pr22 = ['\u0633\u0627', '\u0633\u0646']
  73. # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza
  74. self.verb_pr33 = [
  75. '\u0644\u0646',
  76. '\u0644\u062A',
  77. '\u0644\u064A',
  78. '\u0644\u0623',
  79. ]
  80. # Taa Miim Alif, Taa Noon Shadda
  81. self.verb_suf3 = ['\u062A\u0645\u0627', '\u062A\u0646\u0651']
  82. # Noon Alif, Taa Miim, Taa Alif, Waaw Alif
  83. self.verb_suf2 = [
  84. '\u0646\u0627',
  85. '\u062A\u0645',
  86. '\u062A\u0627',
  87. '\u0648\u0627',
  88. ]
  89. # Taa, Alif, Noon
  90. self.verb_suf1 = ['\u062A', '\u0627', '\u0646']
  91. def stem(self, token):
  92. """
  93. call this function to get the word's stem based on ARLSTem .
  94. """
  95. try:
  96. if token is None:
  97. raise ValueError(
  98. "The word could not be stemmed, because \
  99. it is empty !"
  100. )
  101. # remove Arabic diacritics and replace some letters with others
  102. token = self.norm(token)
  103. # strip common prefixes of the nouns
  104. pre = self.pref(token)
  105. if pre is not None:
  106. token = pre
  107. # strip the suffixes which are common to nouns and verbs
  108. token = self.suff(token)
  109. # transform a plural noun to a singular noun
  110. ps = self.plur2sing(token)
  111. if ps is None:
  112. # transform from the feminine form to the masculine form
  113. fm = self.fem2masc(token)
  114. if fm is not None:
  115. return fm
  116. else:
  117. if pre is None: # if the prefixes are not stripped
  118. # strip the verb prefixes and suffixes
  119. return self.verb(token)
  120. else:
  121. return ps
  122. return token
  123. except ValueError as e:
  124. print(e)
  125. def norm(self, token):
  126. """
  127. normalize the word by removing diacritics, replacing hamzated Alif
  128. with Alif replacing AlifMaqsura with Yaa and removing Waaw at the
  129. beginning.
  130. """
  131. # strip Arabic diacritics
  132. token = self.re_diacritics.sub('', token)
  133. # replace Hamzated Alif with Alif bare
  134. token = self.re_hamzated_alif.sub('\u0627', token)
  135. # replace alifMaqsura with Yaa
  136. token = self.re_alifMaqsura.sub('\u064A', token)
  137. # strip the Waaw from the word beginning if the remaining is 3 letters
  138. # at least
  139. if token.startswith('\u0648') and len(token) > 3:
  140. token = token[1:]
  141. return token
  142. def pref(self, token):
  143. """
  144. remove prefixes from the words' beginning.
  145. """
  146. if len(token) > 5:
  147. for p3 in self.pr3:
  148. if token.startswith(p3):
  149. return token[3:]
  150. if len(token) > 6:
  151. for p4 in self.pr4:
  152. if token.startswith(p4):
  153. return token[4:]
  154. if len(token) > 5:
  155. for p3 in self.pr32:
  156. if token.startswith(p3):
  157. return token[3:]
  158. if len(token) > 4:
  159. for p2 in self.pr2:
  160. if token.startswith(p2):
  161. return token[2:]
  162. def suff(self, token):
  163. """
  164. remove suffixes from the word's end.
  165. """
  166. if token.endswith('\u0643') and len(token) > 3:
  167. return token[:-1]
  168. if len(token) > 4:
  169. for s2 in self.su2:
  170. if token.endswith(s2):
  171. return token[:-2]
  172. if len(token) > 5:
  173. for s3 in self.su3:
  174. if token.endswith(s3):
  175. return token[:-3]
  176. if token.endswith('\u0647') and len(token) > 3:
  177. token = token[:-1]
  178. return token
  179. if len(token) > 4:
  180. for s2 in self.su22:
  181. if token.endswith(s2):
  182. return token[:-2]
  183. if len(token) > 5:
  184. for s3 in self.su32:
  185. if token.endswith(s3):
  186. return token[:-3]
  187. if token.endswith('\u0646\u0627') and len(token) > 4:
  188. return token[:-2]
  189. return token
  190. def fem2masc(self, token):
  191. """
  192. transform the word from the feminine form to the masculine form.
  193. """
  194. if token.endswith('\u0629') and len(token) > 3:
  195. return token[:-1]
  196. def plur2sing(self, token):
  197. """
  198. transform the word from the plural form to the singular form.
  199. """
  200. if len(token) > 4:
  201. for ps2 in self.pl_si2:
  202. if token.endswith(ps2):
  203. return token[:-2]
  204. if len(token) > 5:
  205. for ps3 in self.pl_si3:
  206. if token.endswith(ps3):
  207. return token[:-3]
  208. if len(token) > 3 and token.endswith('\u0627\u062A'):
  209. return token[:-2]
  210. if len(token) > 3 and token.startswith('\u0627') and token[2] == '\u0627':
  211. return token[:2] + token[3:]
  212. if len(token) > 4 and token.startswith('\u0627') and token[-2] == '\u0627':
  213. return token[1:-2] + token[-1]
  214. def verb(self, token):
  215. """
  216. stem the verb prefixes and suffixes or both
  217. """
  218. vb = self.verb_t1(token)
  219. if vb is not None:
  220. return vb
  221. vb = self.verb_t2(token)
  222. if vb is not None:
  223. return vb
  224. vb = self.verb_t3(token)
  225. if vb is not None:
  226. return vb
  227. vb = self.verb_t4(token)
  228. if vb is not None:
  229. return vb
  230. vb = self.verb_t5(token)
  231. if vb is not None:
  232. return vb
  233. return self.verb_t6(token)
  234. def verb_t1(self, token):
  235. """
  236. stem the present prefixes and suffixes
  237. """
  238. if len(token) > 5 and token.startswith('\u062A'): # Taa
  239. for s2 in self.pl_si2:
  240. if token.endswith(s2):
  241. return token[1:-2]
  242. if len(token) > 5 and token.startswith('\u064A'): # Yaa
  243. for s2 in self.verb_su2:
  244. if token.endswith(s2):
  245. return token[1:-2]
  246. if len(token) > 4 and token.startswith('\u0627'): # Alif
  247. # Waaw Alif
  248. if len(token) > 5 and token.endswith('\u0648\u0627'):
  249. return token[1:-2]
  250. # Yaa
  251. if token.endswith('\u064A'):
  252. return token[1:-1]
  253. # Alif
  254. if token.endswith('\u0627'):
  255. return token[1:-1]
  256. # Noon
  257. if token.endswith('\u0646'):
  258. return token[1:-1]
  259. # ^Yaa, Noon$
  260. if len(token) > 4 and token.startswith('\u064A') and token.endswith('\u0646'):
  261. return token[1:-1]
  262. # ^Taa, Noon$
  263. if len(token) > 4 and token.startswith('\u062A') and token.endswith('\u0646'):
  264. return token[1:-1]
  265. def verb_t2(self, token):
  266. """
  267. stem the future prefixes and suffixes
  268. """
  269. if len(token) > 6:
  270. for s2 in self.pl_si2:
  271. # ^Siin Taa
  272. if token.startswith(self.verb_pr2[0]) and token.endswith(s2):
  273. return token[2:-2]
  274. # ^Siin Yaa, Alif Noon$
  275. if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]):
  276. return token[2:-2]
  277. # ^Siin Yaa, Waaw Noon$
  278. if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]):
  279. return token[2:-2]
  280. # ^Siin Taa, Noon$
  281. if (
  282. len(token) > 5
  283. and token.startswith(self.verb_pr2[0])
  284. and token.endswith('\u0646')
  285. ):
  286. return token[2:-1]
  287. # ^Siin Yaa, Noon$
  288. if (
  289. len(token) > 5
  290. and token.startswith(self.verb_pr2[1])
  291. and token.endswith('\u0646')
  292. ):
  293. return token[2:-1]
  294. def verb_t3(self, token):
  295. """
  296. stem the present suffixes
  297. """
  298. if len(token) > 5:
  299. for su3 in self.verb_suf3:
  300. if token.endswith(su3):
  301. return token[:-3]
  302. if len(token) > 4:
  303. for su2 in self.verb_suf2:
  304. if token.endswith(su2):
  305. return token[:-2]
  306. if len(token) > 3:
  307. for su1 in self.verb_suf1:
  308. if token.endswith(su1):
  309. return token[:-1]
  310. def verb_t4(self, token):
  311. """
  312. stem the present prefixes
  313. """
  314. if len(token) > 3:
  315. for pr1 in self.verb_suf1:
  316. if token.startswith(pr1):
  317. return token[1:]
  318. if token.startswith('\u064A'):
  319. return token[1:]
  320. def verb_t5(self, token):
  321. """
  322. stem the future prefixes
  323. """
  324. if len(token) > 4:
  325. for pr2 in self.verb_pr22:
  326. if token.startswith(pr2):
  327. return token[2:]
  328. for pr2 in self.verb_pr2:
  329. if token.startswith(pr2):
  330. return token[2:]
  331. return token
  332. def verb_t6(self, token):
  333. """
  334. stem the order prefixes
  335. """
  336. if len(token) > 4:
  337. for pr3 in self.verb_pr33:
  338. if token.startswith(pr3):
  339. return token[2:]
  340. return token