# -*- coding: utf-8 -*-
#
# Natural Language Toolkit: Snowball Stemmer
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Peter Michael Stahl <pemistahl@gmail.com>
#         Peter Ljunglof <peter.ljunglof@heatherleaf.se> (revisions)
#         Lakhdar Benzahia <lakhdar.benzahia@gmail.com> (co-writer)
#         Assem Chelli <assem.ch@gmail.com> (reviewer arabicstemmer)
#         Abdelkrim Aries <ab_aries@esi.dz> (reviewer arabicstemmer)
# Algorithms: Dr Martin Porter <martin@tartarus.org>
#             Assem Chelli <assem.ch@gmail.com> arabic stemming algorithm
#             Benzahia Lakhdar <lakhdar.benzahia@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
"""
Snowball stemmers

This module provides a port of the Snowball stemmers
developed by Martin Porter.

There is also a demo function: `snowball.demo()`.
"""
from __future__ import unicode_literals, print_function

import re

from six.moves import input

from nltk import compat
from nltk.corpus import stopwords
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace

from nltk.stem.api import StemmerI


class SnowballStemmer(StemmerI):

    """
    Snowball Stemmer

    The following languages are supported:
    Arabic, Danish, Dutch, English, Finnish, French, German,
    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
    Spanish and Swedish.

    The algorithm for English is documented here:

        Porter, M. \"An algorithm for suffix stripping.\"
        Program 14.3 (1980): 130-137.

    The algorithms have been developed by Martin Porter.
    These stemmers are called Snowball, because Porter created
    a programming language with this name for creating
    new stemming algorithms. There is more information available
    at http://snowball.tartarus.org/

    The stemmer is invoked as shown below:

    >>> from nltk.stem import SnowballStemmer
    >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
    arabic danish dutch english finnish french german hungarian
    italian norwegian porter portuguese romanian russian
    spanish swedish
    >>> stemmer = SnowballStemmer("german") # Choose a language
    >>> stemmer.stem("Autobahnen") # Stem a word
    'autobahn'

    Invoking the stemmers that way is useful if you do not know the
    language to be stemmed at runtime. Alternatively, if you already know
    the language, then you can invoke the language specific stemmer directly:

    >>> from nltk.stem.snowball import GermanStemmer
    >>> stemmer = GermanStemmer()
    >>> stemmer.stem("Autobahnen")
    'autobahn'

    :param language: The language whose subclass is instantiated.
    :type language: str or unicode
    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    :raise ValueError: If there is no stemmer for the specified
                       language, a ValueError is raised.
    """

    languages = (
        "arabic",
        "danish",
        "dutch",
        "english",
        "finnish",
        "french",
        "german",
        "hungarian",
        "italian",
        "norwegian",
        "porter",
        "portuguese",
        "romanian",
        "russian",
        "spanish",
        "swedish",
    )

    def __init__(self, language, ignore_stopwords=False):
        if language not in self.languages:
            raise ValueError("The language '{0}' is not supported.".format(language))
        stemmerclass = globals()[language.capitalize() + "Stemmer"]
        self.stemmer = stemmerclass(ignore_stopwords)
        self.stem = self.stemmer.stem
        self.stopwords = self.stemmer.stopwords

    def stem(self, token):
        # Delegate to the language-specific stemmer selected in __init__.
        return self.stemmer.stem(token)


@compat.python_2_unicode_compatible
class _LanguageSpecificStemmer(StemmerI):
    """
    This helper subclass offers the possibility
    to invoke a specific stemmer directly.
    This is useful if you already know the language to be stemmed at runtime.

    Create an instance of the Snowball stemmer.

    :param ignore_stopwords: If set to True, stopwords are
                             not stemmed and returned unchanged.
                             Set to False by default.
    :type ignore_stopwords: bool
    """

    def __init__(self, ignore_stopwords=False):
        # The language is the name of the class, minus the final "Stemmer".
        language = type(self).__name__.lower()
        if language.endswith("stemmer"):
            language = language[:-7]

        self.stopwords = set()
        if ignore_stopwords:
            try:
                for word in stopwords.words(language):
                    self.stopwords.add(word)
            except IOError:
                raise ValueError(
                    "{!r} has no list of stopwords. Please set"
                    " 'ignore_stopwords' to 'False'.".format(self)
                )

    def __repr__(self):
        """
        Print out the string representation of the respective class.
        """
        return "<{0}>".format(type(self).__name__)
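
    # Illustrative note (an assumption about typical use, not part of the
    # original comments): with the naming convention above, a subclass named
    # DanishStemmer resolves to the language name "danish", which is also the
    # fileid passed to nltk.corpus.stopwords.words("danish") when
    # ignore_stopwords=True.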


class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer):
    """
    A word stemmer based on the original Porter stemming algorithm.

        Porter, M. \"An algorithm for suffix stripping.\"
        Program 14.3 (1980): 130-137.

    A few minor modifications have been made to Porter's basic
    algorithm. See the source code of the module
    nltk.stem.porter for more information.
    """

    def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)


class _ScandinavianStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates a method for defining the string region R1.
    It is used by the Danish, Norwegian, and Swedish stemmer.
    """

    def _r1_scandinavian(self, word, vowels):
        """
        Return the region R1 that is used by the Scandinavian stemmers.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel. But then R1 is adjusted so that the region
        before it contains at least three letters.

        :param word: The word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region R1.
        :type vowels: unicode
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses DanishStemmer, NorwegianStemmer, and
               SwedishStemmer. It is not to be invoked directly!
        """
        r1 = ""
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
                    r1 = word[3:]
                elif len(word[: i + 1]) >= 3:
                    r1 = word[i + 1 :]
                else:
                    return word
                break

        return r1
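
    # Worked example (a sketch): with the Danish vowels "aeiouy\xE6\xE5\xF8",
    # the first non-vowel following a vowel in "springe" is the "n", so R1 is
    # "ge". In "undervisning" it is the first "n", but the region before it
    # would then hold only two letters, so R1 is adjusted to start after the
    # third letter, giving "ervisning".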


class _StandardStemmer(_LanguageSpecificStemmer):
    """
    This subclass encapsulates two methods for defining the standard versions
    of the string regions R1, R2, and RV.
    """

    def _r1r2_standard(self, word, vowels):
        """
        Return the standard interpretations of the string regions R1 and R2.

        R1 is the region after the first non-vowel following a vowel,
        or is the null region at the end of the word if there is no
        such non-vowel.

        R2 is the region after the first non-vowel following a vowel
        in R1, or is the null region at the end of the word if there
        is no such non-vowel.

        :param word: The word whose regions R1 and R2 are determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the regions R1 and R2.
        :type vowels: unicode
        :return: (r1,r2), the regions R1 and R2 for the respective word.
        :rtype: tuple
        :note: This helper method is invoked by the respective stem method of
               the subclasses DutchStemmer, FinnishStemmer,
               FrenchStemmer, GermanStemmer, ItalianStemmer,
               PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
               It is not to be invoked directly!
        :note: A detailed description of how to define R1 and R2
               can be found at http://snowball.tartarus.org/texts/r1r2.html
        """
        r1 = ""
        r2 = ""
        for i in range(1, len(word)):
            if word[i] not in vowels and word[i - 1] in vowels:
                r1 = word[i + 1 :]
                break

        for i in range(1, len(r1)):
            if r1[i] not in vowels and r1[i - 1] in vowels:
                r2 = r1[i + 1 :]
                break

        return (r1, r2)
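
    # Worked example, following http://snowball.tartarus.org/texts/r1r2.html,
    # with the English vowels "aeiouy":
    #
    #     _r1r2_standard("beautiful", "aeiouy")  ->  ("iful", "ul")
    #     _r1r2_standard("beauty", "aeiouy")     ->  ("y", "")
    #
    # i.e. R1 starts after the "t" (the first non-vowel following a vowel),
    # and R2 is obtained by applying the same rule again inside R1.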

    def _rv_standard(self, word, vowels):
        """
        Return the standard interpretation of the string region RV.

        If the second letter is a consonant, RV is the region after the
        next following vowel. If the first two letters are vowels, RV is
        the region after the next following consonant. Otherwise, RV is
        the region after the third letter.

        :param word: The word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The vowels of the respective language that are
                       used to determine the region RV.
        :type vowels: unicode
        :return: the region RV for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the respective stem method of
               the subclasses ItalianStemmer, PortugueseStemmer,
               RomanianStemmer, and SpanishStemmer. It is not to be
               invoked directly!
        """
        rv = ""
        if len(word) >= 2:
            if word[1] not in vowels:
                for i in range(2, len(word)):
                    if word[i] in vowels:
                        rv = word[i + 1 :]
                        break

            elif word[0] in vowels and word[1] in vowels:
                for i in range(2, len(word)):
                    if word[i] not in vowels:
                        rv = word[i + 1 :]
                        break
            else:
                rv = word[3:]
        return rv
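
    # Worked examples, as given in the Spanish stemmer description
    # (http://snowball.tartarus.org/algorithms/spanish/stemmer.html), where
    # spanish_vowels stands for the vowel string used by SpanishStemmer:
    #
    #     _rv_standard("macho", spanish_vowels)    ->  "ho"    (third rule)
    #     _rv_standard("oliva", spanish_vowels)    ->  "va"    (second letter is a consonant)
    #     _rv_standard("trabajo", spanish_vowels)  ->  "bajo"  (second letter is a consonant)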


class ArabicStemmer(_StandardStemmer):
    """
    https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
    The Snowball Arabic light Stemmer
    Algorithm           : Assem Chelli
                          Abdelkrim Aries
                          Lakhdar Benzahia
    NLTK Version Author : Lakhdar Benzahia
    """

    # Normalize_pre steps
    __vocalization = re.compile(
        r'[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]'
    )  # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ

    __kasheeda = re.compile(r'[\u0640]')  # ـ tatweel/kasheeda

    __arabic_punctuation_marks = re.compile(r'[\u060C-\u061B-\u061F]')  # ؛ ، ؟

    # Normalize_post
    __last_hamzat = ('\u0623', '\u0625', '\u0622', '\u0624', '\u0626')  # أ، إ، آ، ؤ، ئ

    # normalize other hamza's
    __initial_hamzat = re.compile(r'^[\u0622\u0623\u0625]')  # أ، إ، آ

    __waw_hamza = re.compile(r'[\u0624]')  # ؤ

    __yeh_hamza = re.compile(r'[\u0626]')  # ئ

    __alefat = re.compile(r'[\u0623\u0622\u0625]')  # أ، إ، آ

    # Checks
    __checks1 = (
        '\u0643\u0627\u0644',
        '\u0628\u0627\u0644',  # بال، كال
        '\u0627\u0644',
        '\u0644\u0644',  # لل، ال
    )

    __checks2 = ('\u0629', '\u0627\u062a')  # ة  # female plural ات

    # Suffixes
    __suffix_noun_step1a = (
        '\u064a',
        '\u0643',
        '\u0647',  # ي، ك، ه
        '\u0646\u0627',
        '\u0643\u0645',
        '\u0647\u0627',
        '\u0647\u0646',
        '\u0647\u0645',  # نا، كم، ها، هن، هم
        '\u0643\u0645\u0627',
        '\u0647\u0645\u0627',  # كما، هما
    )

    __suffix_noun_step1b = '\u0646'  # ن

    __suffix_noun_step2a = ('\u0627', '\u064a', '\u0648')  # ا، ي، و

    __suffix_noun_step2b = '\u0627\u062a'  # ات

    __suffix_noun_step2c1 = '\u062a'  # ت

    __suffix_noun_step2c2 = '\u0629'  # ة

    __suffix_noun_step3 = '\u064a'  # ي

    __suffix_verb_step1 = (
        '\u0647',
        '\u0643',  # ه، ك
        '\u0646\u064a',
        '\u0646\u0627',
        '\u0647\u0627',
        '\u0647\u0645',  # ني، نا، ها، هم
        '\u0647\u0646',
        '\u0643\u0645',
        '\u0643\u0646',  # هن، كم، كن
        '\u0647\u0645\u0627',
        '\u0643\u0645\u0627',
        '\u0643\u0645\u0648',  # هما، كما، كمو
    )

    __suffix_verb_step2a = (
        '\u062a',
        '\u0627',
        '\u0646',
        '\u064a',  # ت، ا، ن، ي
        '\u0646\u0627',
        '\u062a\u0627',
        '\u062a\u0646',  # نا، تا، تن Past
        '\u0627\u0646',
        '\u0648\u0646',
        '\u064a\u0646',  # ان، ون، ين Present
        '\u062a\u0645\u0627',  # تما
    )

    __suffix_verb_step2b = ('\u0648\u0627', '\u062a\u0645')  # وا، تم

    __suffix_verb_step2c = ('\u0648', '\u062a\u0645\u0648')  # و  # تمو

    __suffix_all_alef_maqsura = '\u0649'  # ى

    # Prefixes
    __prefix_step1 = (
        '\u0623',  # أ
        '\u0623\u0623',
        '\u0623\u0622',
        '\u0623\u0624',
        '\u0623\u0627',
        '\u0623\u0625',  # أأ، أآ، أؤ، أا، أإ
    )

    __prefix_step2a = ('\u0641\u0627\u0644', '\u0648\u0627\u0644')  # فال، وال

    __prefix_step2b = ('\u0641', '\u0648')  # ف، و

    __prefix_step3a_noun = (
        '\u0627\u0644',
        '\u0644\u0644',  # لل، ال
        '\u0643\u0627\u0644',
        '\u0628\u0627\u0644',  # بال، كال
    )

    __prefix_step3b_noun = (
        '\u0628',
        '\u0643',
        '\u0644',  # ب، ك، ل
        '\u0628\u0628',
        '\u0643\u0643',  # بب، كك
    )

    __prefix_step3_verb = (
        '\u0633\u064a',
        '\u0633\u062a',
        '\u0633\u0646',
        '\u0633\u0623',
    )  # سي، ست، سن، سأ

    __prefix_step4_verb = (
        '\u064a\u0633\u062a',
        '\u0646\u0633\u062a',
        '\u062a\u0633\u062a',
    )  # يست، نست، تست

    # Suffixes added due to Conjugation Verbs
    __conjugation_suffix_verb_1 = ('\u0647', '\u0643')  # ه، ك

    __conjugation_suffix_verb_2 = (
        '\u0646\u064a',
        '\u0646\u0627',
        '\u0647\u0627',  # ني، نا، ها
        '\u0647\u0645',
        '\u0647\u0646',
        '\u0643\u0645',  # هم، هن، كم
        '\u0643\u0646',  # كن
    )

    __conjugation_suffix_verb_3 = (
        '\u0647\u0645\u0627',
        '\u0643\u0645\u0627',
        '\u0643\u0645\u0648',
    )  # هما، كما، كمو

    __conjugation_suffix_verb_4 = ('\u0627', '\u0646', '\u064a')  # ا، ن، ي

    __conjugation_suffix_verb_past = (
        '\u0646\u0627',
        '\u062a\u0627',
        '\u062a\u0646',
    )  # نا، تا، تن

    __conjugation_suffix_verb_present = (
        '\u0627\u0646',
        '\u0648\u0646',
        '\u064a\u0646',
    )  # ان، ون، ين

    # Suffixes added due to derivation Names
    __conjugation_suffix_noun_1 = ('\u064a', '\u0643', '\u0647')  # ي، ك، ه

    __conjugation_suffix_noun_2 = (
        '\u0646\u0627',
        '\u0643\u0645',  # نا، كم
        '\u0647\u0627',
        '\u0647\u0646',
        '\u0647\u0645',  # ها، هن، هم
    )

    __conjugation_suffix_noun_3 = (
        '\u0643\u0645\u0627',
        '\u0647\u0645\u0627',
    )  # كما، هما

    # Prefixes added due to derivation Names
    __prefixes1 = ('\u0648\u0627', '\u0641\u0627')  # فا، وا

    __articles_3len = ('\u0643\u0627\u0644', '\u0628\u0627\u0644')  # بال كال

    __articles_2len = ('\u0627\u0644', '\u0644\u0644')  # ال لل

    # Prepositions letters
    __prepositions1 = ('\u0643', '\u0644')  # ك، ل

    __prepositions2 = ('\u0628\u0628', '\u0643\u0643')  # بب، كك

    is_verb = True
    is_noun = True
    is_defined = False

    suffixes_verb_step1_success = False
    suffix_verb_step2a_success = False
    suffix_verb_step2b_success = False
    suffix_noun_step2c2_success = False
    suffix_noun_step1a_success = False
    suffix_noun_step2a_success = False
    suffix_noun_step2b_success = False
    suffixe_noun_step1b_success = False
    prefix_step2a_success = False
    prefix_step3a_noun_success = False
    prefix_step3b_noun_success = False

    def __normalize_pre(self, token):
        """
        :param token: string
        :return: normalized token type string
        """
        # strip diacritics
        token = self.__vocalization.sub('', token)
        # strip kasheeda
        token = self.__kasheeda.sub('', token)
        # strip punctuation marks
        token = self.__arabic_punctuation_marks.sub('', token)
        return token
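
    # For instance (a sketch of the normalization above): the vocalized token
    # '\u0643\u064e\u062a\u064e\u0628\u064e' ("كَتَبَ") loses its short-vowel
    # diacritics and becomes '\u0643\u062a\u0628' ("كتب"), and a
    # tatweel-stretched token such as '\u0643\u0640\u062a\u0627\u0628' is
    # reduced to '\u0643\u062a\u0627\u0628' ("كتاب").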

    def __normalize_post(self, token):
        # normalize last hamza
        for hamza in self.__last_hamzat:
            if token.endswith(hamza):
                token = suffix_replace(token, hamza, '\u0621')
                break
        # normalize other hamzat
        token = self.__initial_hamzat.sub('\u0627', token)
        token = self.__waw_hamza.sub('\u0648', token)
        token = self.__yeh_hamza.sub('\u064a', token)
        token = self.__alefat.sub('\u0627', token)
        return token

    def __checks_1(self, token):
        for prefix in self.__checks1:
            if token.startswith(prefix):
                if prefix in self.__articles_3len and len(token) > 4:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

                if prefix in self.__articles_2len and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    self.is_defined = True
                    break

    def __checks_2(self, token):
        for suffix in self.__checks2:
            if token.endswith(suffix):
                if suffix == '\u0629' and len(token) > 2:
                    self.is_noun = True
                    self.is_verb = False
                    break

                if suffix == '\u0627\u062a' and len(token) > 3:
                    self.is_noun = True
                    self.is_verb = False
                    break

    def __Suffix_Verb_Step1(self, token):
        for suffix in self.__suffix_verb_step1:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffixes_verb_step1_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffixes_verb_step1_success = True
                    break
        return token

    def __Suffix_Verb_Step2a(self, token):
        for suffix in self.__suffix_verb_step2a:
            if token.endswith(suffix) and len(token) > 3:
                if suffix == '\u062a' and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5:
                    token = token[:-2]  # past
                    self.suffix_verb_step2a_success = True
                    break

                if suffix in self.__conjugation_suffix_verb_present and len(token) > 5:
                    token = token[:-2]  # present
                    self.suffix_verb_step2a_success = True
                    break

                if suffix == '\u062a\u0645\u0627' and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_verb_step2a_success = True
                    break
        return token

    def __Suffix_Verb_Step2c(self, token):
        for suffix in self.__suffix_verb_step2c:
            if token.endswith(suffix):
                if suffix == '\u062a\u0645\u0648' and len(token) >= 6:
                    token = token[:-3]
                    break

                if suffix == '\u0648' and len(token) >= 4:
                    token = token[:-1]
                    break
        return token

    def __Suffix_Verb_Step2b(self, token):
        for suffix in self.__suffix_verb_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_verb_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c2(self, token):
        for suffix in self.__suffix_noun_step2c2:
            if token.endswith(suffix) and len(token) >= 3:
                token = token[:-1]
                self.suffix_noun_step2c2_success = True
                break
        return token

    def __Suffix_Noun_Step1a(self, token):
        for suffix in self.__suffix_noun_step1a:
            if token.endswith(suffix):
                if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4:
                    token = token[:-1]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5:
                    token = token[:-2]
                    self.suffix_noun_step1a_success = True
                    break

                if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6:
                    token = token[:-3]
                    self.suffix_noun_step1a_success = True
                    break
        return token

    def __Suffix_Noun_Step2a(self, token):
        for suffix in self.__suffix_noun_step2a:
            if token.endswith(suffix) and len(token) > 4:
                token = token[:-1]
                self.suffix_noun_step2a_success = True
                break
        return token

    def __Suffix_Noun_Step2b(self, token):
        for suffix in self.__suffix_noun_step2b:
            if token.endswith(suffix) and len(token) >= 5:
                token = token[:-2]
                self.suffix_noun_step2b_success = True
                break
        return token

    def __Suffix_Noun_Step2c1(self, token):
        for suffix in self.__suffix_noun_step2c1:
            if token.endswith(suffix) and len(token) >= 4:
                token = token[:-1]
                break
        return token

    def __Suffix_Noun_Step1b(self, token):
        for suffix in self.__suffix_noun_step1b:
            if token.endswith(suffix) and len(token) > 5:
                token = token[:-1]
                self.suffixe_noun_step1b_success = True
                break
        return token

    def __Suffix_Noun_Step3(self, token):
        for suffix in self.__suffix_noun_step3:
            if token.endswith(suffix) and len(token) >= 3:
                token = token[:-1]  # ya' nisbiya
                break
        return token

    def __Suffix_All_alef_maqsura(self, token):
        for suffix in self.__suffix_all_alef_maqsura:
            if token.endswith(suffix):
                token = suffix_replace(token, suffix, '\u064a')
        return token

    def __Prefix_Step1(self, token):
        for prefix in self.__prefix_step1:
            if token.startswith(prefix) and len(token) > 3:
                if prefix == '\u0623\u0623':
                    token = prefix_replace(token, prefix, '\u0623')
                    break

                elif prefix == '\u0623\u0622':
                    token = prefix_replace(token, prefix, '\u0622')
                    break

                elif prefix == '\u0623\u0624':
                    token = prefix_replace(token, prefix, '\u0624')
                    break

                elif prefix == '\u0623\u0627':
                    token = prefix_replace(token, prefix, '\u0627')
                    break

                elif prefix == '\u0623\u0625':
                    token = prefix_replace(token, prefix, '\u0625')
                    break
        return token

    def __Prefix_Step2a(self, token):
        for prefix in self.__prefix_step2a:
            if token.startswith(prefix) and len(token) > 5:
                token = token[len(prefix) :]
                self.prefix_step2a_success = True
                break
        return token

    def __Prefix_Step2b(self, token):
        for prefix in self.__prefix_step2b:
            if token.startswith(prefix) and len(token) > 3:
                if token[:2] not in self.__prefixes1:
                    token = token[len(prefix) :]
                    break
        return token

    def __Prefix_Step3a_Noun(self, token):
        for prefix in self.__prefix_step3a_noun:
            if token.startswith(prefix):
                if prefix in self.__articles_2len and len(token) > 4:
                    token = token[len(prefix) :]
                    self.prefix_step3a_noun_success = True
                    break
                if prefix in self.__articles_3len and len(token) > 5:
                    token = token[len(prefix) :]
                    break
        return token

    def __Prefix_Step3b_Noun(self, token):
        for prefix in self.__prefix_step3b_noun:
            if token.startswith(prefix):
                if len(token) > 3:
                    if prefix == '\u0628':
                        token = token[len(prefix) :]
                        self.prefix_step3b_noun_success = True
                        break

                    if prefix in self.__prepositions2:
                        token = prefix_replace(token, prefix, prefix[1])
                        self.prefix_step3b_noun_success = True
                        break

                if prefix in self.__prepositions1 and len(token) > 4:
                    token = token[len(prefix) :]  # BUG: cause confusion
                    self.prefix_step3b_noun_success = True
                    break
        return token

    def __Prefix_Step3_Verb(self, token):
        for prefix in self.__prefix_step3_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, prefix[1])
                break
        return token

    def __Prefix_Step4_Verb(self, token):
        for prefix in self.__prefix_step4_verb:
            if token.startswith(prefix) and len(token) > 4:
                token = prefix_replace(token, prefix, '\u0627\u0633\u062a')
                self.is_verb = True
                self.is_noun = False
                break
        return token

    def stem(self, word):
        """
        Stem an Arabic word and return the stemmed form.

        :param word: string
        :return: string
        """
        # set initial values
        self.is_verb = True
        self.is_noun = True
        self.is_defined = False

        self.suffix_verb_step2a_success = False
        self.suffix_verb_step2b_success = False
        self.suffix_noun_step2c2_success = False
        self.suffix_noun_step1a_success = False
        self.suffix_noun_step2a_success = False
        self.suffix_noun_step2b_success = False
        self.suffixe_noun_step1b_success = False
        self.prefix_step2a_success = False
        self.prefix_step3a_noun_success = False
        self.prefix_step3b_noun_success = False

        modified_word = word
        # guess type and properties
        # checks1
        self.__checks_1(modified_word)
        # checks2
        self.__checks_2(modified_word)
        # Pre_Normalization
        modified_word = self.__normalize_pre(modified_word)
        # Avoid stopwords
        if modified_word in self.stopwords or len(modified_word) <= 2:
            return modified_word
        # Start stemming
        if self.is_verb:
            modified_word = self.__Suffix_Verb_Step1(modified_word)
            if self.suffixes_verb_step1_success:
                modified_word = self.__Suffix_Verb_Step2a(modified_word)
                if not self.suffix_verb_step2a_success:
                    modified_word = self.__Suffix_Verb_Step2c(modified_word)
                # or next TODO: How to deal with or next instruction
            else:
                modified_word = self.__Suffix_Verb_Step2b(modified_word)
                if not self.suffix_verb_step2b_success:
                    modified_word = self.__Suffix_Verb_Step2a(modified_word)
        if self.is_noun:
            modified_word = self.__Suffix_Noun_Step2c2(modified_word)
            if not self.suffix_noun_step2c2_success:
                if not self.is_defined:
                    modified_word = self.__Suffix_Noun_Step1a(modified_word)
                    # if self.suffix_noun_step1a_success:
                    modified_word = self.__Suffix_Noun_Step2a(modified_word)
                    if not self.suffix_noun_step2a_success:
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
                    if (
                        not self.suffix_noun_step2b_success
                        and not self.suffix_noun_step2a_success
                    ):
                        modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    # or next ? todo : how to deal with or next
                else:
                    modified_word = self.__Suffix_Noun_Step1b(modified_word)
                    if self.suffixe_noun_step1b_success:
                        modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        if not self.suffix_noun_step2a_success:
                            modified_word = self.__Suffix_Noun_Step2b(modified_word)
                        if (
                            not self.suffix_noun_step2b_success
                            and not self.suffix_noun_step2a_success
                        ):
                            modified_word = self.__Suffix_Noun_Step2c1(modified_word)
                    else:
                        if not self.is_defined:
                            modified_word = self.__Suffix_Noun_Step2a(modified_word)
                        modified_word = self.__Suffix_Noun_Step2b(modified_word)
            modified_word = self.__Suffix_Noun_Step3(modified_word)
        if not self.is_noun and self.is_verb:
            modified_word = self.__Suffix_All_alef_maqsura(modified_word)
        # prefixes
        modified_word = self.__Prefix_Step1(modified_word)
        modified_word = self.__Prefix_Step2a(modified_word)
        if not self.prefix_step2a_success:
            modified_word = self.__Prefix_Step2b(modified_word)
        modified_word = self.__Prefix_Step3a_Noun(modified_word)
        if not self.prefix_step3a_noun_success and self.is_noun:
            modified_word = self.__Prefix_Step3b_Noun(modified_word)
        else:
            if not self.prefix_step3b_noun_success and self.is_verb:
                modified_word = self.__Prefix_Step3_Verb(modified_word)
                modified_word = self.__Prefix_Step4_Verb(modified_word)
        # post normalization stemming
        modified_word = self.__normalize_post(modified_word)
        stemmed_word = modified_word
        return stemmed_word
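
    # Usage sketch (illustrative only, with the default stopword-free setup):
    # stemming '\u0627\u0644\u0643\u062a\u0627\u0628' ("الكتاب", "the book")
    # first marks the token as a defined noun in __checks_1, and
    # __Prefix_Step3a_Noun then strips the article '\u0627\u0644', so the
    # expected light stem is '\u0643\u062a\u0627\u0628' ("كتاب").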


class DanishStemmer(_ScandinavianStemmer):
    """
    The Danish Snowball stemmer.

    :cvar __vowels: The Danish vowels.
    :type __vowels: unicode
    :cvar __consonants: The Danish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Danish double consonants.
    :type __double_consonants: tuple
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Danish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/danish/stemmer.html
    """

    # The language's vowels and other important characters are defined.
    __vowels = "aeiouy\xE6\xE5\xF8"
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = (
        "bb",
        "cc",
        "dd",
        "ff",
        "gg",
        "hh",
        "jj",
        "kk",
        "ll",
        "mm",
        "nn",
        "pp",
        "qq",
        "rr",
        "ss",
        "tt",
        "vv",
        "ww",
        "xx",
        "zz",
    )
    __s_ending = "abcdfghjklmnoprtvyz\xE5"

    # The different suffixes, divided into the algorithm's steps
    # and organized by length, are listed in tuples.
    __step1_suffixes = (
        "erendes",
        "erende",
        "hedens",
        "ethed",
        "erede",
        "heden",
        "heder",
        "endes",
        "ernes",
        "erens",
        "erets",
        "ered",
        "ende",
        "erne",
        "eren",
        "erer",
        "heds",
        "enes",
        "eres",
        "eret",
        "hed",
        "ene",
        "ere",
        "ens",
        "ers",
        "ets",
        "en",
        "er",
        "es",
        "et",
        "e",
        "s",
    )
    __step2_suffixes = ("gd", "dt", "gt", "kt")
    __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig")

    def stem(self, word):
        """
        Stem a Danish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        # Every word is put into lower case for normalization.
        word = word.lower()

        if word in self.stopwords:
            return word

        # After this, the required regions are generated
        # by the respective helper method.
        r1 = self._r1_scandinavian(word, self.__vowels)

        # Then the actual stemming process starts.
        # Every new step is explicitly indicated
        # according to the descriptions on the Snowball website.

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "s":
                    if word[-2] in self.__s_ending:
                        word = word[:-1]
                        r1 = r1[:-1]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                word = word[:-1]
                r1 = r1[:-1]
                break

        # STEP 3
        if r1.endswith("igst"):
            word = word[:-2]
            r1 = r1[:-2]

        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == "l\xF8st":
                    word = word[:-1]
                    r1 = r1[:-1]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]

                    if r1.endswith(self.__step2_suffixes):
                        word = word[:-1]
                        r1 = r1[:-1]
                break

        # STEP 4: Undouble
        for double_cons in self.__double_consonants:
            if word.endswith(double_cons) and len(word) > 3:
                word = word[:-1]
                break

        return word
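
    # Sketch of the steps above on a single word (assuming the default,
    # stopword-free configuration): for "hundens" the region R1 is "dens";
    # step 1 removes the suffix "ens", leaving "hund"; steps 2-4 then find
    # nothing further to strip, so DanishStemmer().stem("hundens") should
    # return "hund".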
class DutchStemmer(_StandardStemmer):
    """
    The Dutch Snowball stemmer.

    :cvar __vowels: The Dutch vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
    :type __step3b_suffixes: tuple
    :note: A detailed description of the Dutch
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/dutch/stemmer.html
    """

    __vowels = "aeiouy\xE8"
    __step1_suffixes = ("heden", "ene", "en", "se", "s")
    __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")

    def stem(self, word):
        """
        Stem a Dutch word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        step2_success = False

        # Vowel accents are removed.
        word = (
            word.replace("\xE4", "a")
            .replace("\xE1", "a")
            .replace("\xEB", "e")
            .replace("\xE9", "e")
            .replace("\xED", "i")
            .replace("\xEF", "i")
            .replace("\xF6", "o")
            .replace("\xF3", "o")
            .replace("\xFC", "u")
            .replace("\xFA", "u")
        )

        # An initial 'y', a 'y' after a vowel,
        # and an 'i' between vowels are put into upper case.
        # From now on these are treated as consonants.
        if word.startswith("y"):
            word = "".join(("Y", word[1:]))

        for i in range(1, len(word)):
            if word[i - 1] in self.__vowels and word[i] == "y":
                word = "".join((word[:i], "Y", word[i + 1 :]))

        for i in range(1, len(word) - 1):
            if (
                word[i - 1] in self.__vowels
                and word[i] == "i"
                and word[i + 1] in self.__vowels
            ):
                word = "".join((word[:i], "I", word[i + 1 :]))

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # R1 is adjusted so that the region before it
        # contains at least 3 letters.
        for i in range(1, len(word)):
            if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
                if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
                    r1 = word[3:]
                elif len(word[: i + 1]) == 0:
                    return word
                break

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "heden":
                    word = suffix_replace(word, suffix, "heid")
                    r1 = suffix_replace(r1, suffix, "heid")
                    if r2.endswith("heden"):
                        r2 = suffix_replace(r2, suffix, "heid")

                elif (
                    suffix in ("ene", "en")
                    and not word.endswith("heden")
                    and word[-len(suffix) - 1] not in self.__vowels
                    and word[-len(suffix) - 3 : -len(suffix)] != "gem"
                ):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                    if word.endswith(("kk", "dd", "tt")):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                elif (
                    suffix in ("se", "s")
                    and word[-len(suffix) - 1] not in self.__vowels
                    and word[-len(suffix) - 1] != "j"
                ):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                break

        # STEP 2
        if r1.endswith("e") and word[-2] not in self.__vowels:
            step2_success = True
            word = word[:-1]
            r1 = r1[:-1]
            r2 = r2[:-1]

            if word.endswith(("kk", "dd", "tt")):
                word = word[:-1]
                r1 = r1[:-1]
                r2 = r2[:-1]

        # STEP 3a
        if r2.endswith("heid") and word[-5] != "c":
            word = word[:-4]
            r1 = r1[:-4]
            r2 = r2[:-4]

            if (
                r1.endswith("en")
                and word[-3] not in self.__vowels
                and word[-5:-2] != "gem"
            ):
                word = word[:-2]
                r1 = r1[:-2]
                r2 = r2[:-2]

                if word.endswith(("kk", "dd", "tt")):
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]

        # STEP 3b: Derivational suffixes
        for suffix in self.__step3b_suffixes:
            if r2.endswith(suffix):
                if suffix in ("end", "ing"):
                    word = word[:-3]
                    r2 = r2[:-3]

                    if r2.endswith("ig") and word[-3] != "e":
                        word = word[:-2]
                    else:
                        if word.endswith(("kk", "dd", "tt")):
                            word = word[:-1]

                elif suffix == "ig" and word[-3] != "e":
                    word = word[:-2]

                elif suffix == "lijk":
                    word = word[:-4]
                    r1 = r1[:-4]

                    if r1.endswith("e") and word[-2] not in self.__vowels:
                        word = word[:-1]
                        if word.endswith(("kk", "dd", "tt")):
                            word = word[:-1]

                elif suffix == "baar":
                    word = word[:-4]

                elif suffix == "bar" and step2_success:
                    word = word[:-3]
                break

        # STEP 4: Undouble vowel
        if len(word) >= 4:
            if word[-1] not in self.__vowels and word[-1] != "I":
                if word[-3:-1] in ("aa", "ee", "oo", "uu"):
                    if word[-4] not in self.__vowels:
                        word = "".join((word[:-3], word[-3], word[-1]))

        # All occurrences of 'I' and 'Y' are put back into lower case.
        word = word.replace("I", "i").replace("Y", "y")

        return word
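
# Illustrative usage of the Dutch stemmer above (a minimal sketch: the import
# path assumes this module is installed as nltk.stem.snowball, and the result
# is obtained by tracing the steps above with the default
# ignore_stopwords=False):
#
#     >>> from nltk.stem.snowball import DutchStemmer
#     >>> DutchStemmer().stem("mogelijkheden")
#     'mogelijk'
#
# Step 1 rewrites the plural "heden" to "heid", and step 3a then removes the
# "heid" that now sits in R2.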
class EnglishStemmer(_StandardStemmer):
    """
    The English Snowball stemmer.

    :cvar __vowels: The English vowels.
    :type __vowels: unicode
    :cvar __double_consonants: The English double consonants.
    :type __double_consonants: tuple
    :cvar __li_ending: Letters that may directly appear before a word final 'li'.
    :type __li_ending: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
    :type __step1a_suffixes: tuple
    :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
    :type __step1b_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __special_words: A dictionary containing words
                           which have to be stemmed specially.
    :type __special_words: dict
    :note: A detailed description of the English
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/english/stemmer.html
    """

    __vowels = "aeiouy"
    __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt")
    __li_ending = "cdeghkmnrt"
    __step0_suffixes = ("'s'", "'s", "'")
    __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
    __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
    __step2_suffixes = (
        "ization", "ational", "fulness", "ousness", "iveness", "tional",
        "biliti", "lessli", "entli", "ation", "alism", "aliti", "ousli",
        "iviti", "fulli", "enci", "anci", "abli", "izer", "ator", "alli",
        "bli", "ogi", "li",
    )
    __step3_suffixes = (
        "ational", "tional", "alize", "icate", "iciti", "ative", "ical",
        "ness", "ful",
    )
    __step4_suffixes = (
        "ement", "ance", "ence", "able", "ible", "ment", "ant", "ent",
        "ism", "ate", "iti", "ous", "ive", "ize", "ion", "al", "er", "ic",
    )
    __step5_suffixes = ("e", "l")
    __special_words = {
        "skis": "ski",
        "skies": "sky",
        "dying": "die",
        "lying": "lie",
        "tying": "tie",
        "idly": "idl",
        "gently": "gentl",
        "ugly": "ugli",
        "early": "earli",
        "only": "onli",
        "singly": "singl",
        "sky": "sky",
        "news": "news",
        "howe": "howe",
        "atlas": "atlas",
        "cosmos": "cosmos",
        "bias": "bias",
        "andes": "andes",
        "inning": "inning",
        "innings": "inning",
        "outing": "outing",
        "outings": "outing",
        "canning": "canning",
        "cannings": "canning",
        "herring": "herring",
        "herrings": "herring",
        "earring": "earring",
        "earrings": "earring",
        "proceed": "proceed",
        "proceeds": "proceed",
        "proceeded": "proceed",
        "proceeding": "proceed",
        "exceed": "exceed",
        "exceeds": "exceed",
        "exceeded": "exceed",
        "exceeding": "exceed",
        "succeed": "succeed",
        "succeeds": "succeed",
        "succeeded": "succeed",
        "succeeding": "succeed",
    }
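
    # Illustrative note (a sketch, not part of the original module): entries
    # in the table above bypass the algorithm entirely, so their results can
    # be read straight from the mapping:
    #
    #     >>> EnglishStemmer().stem("dying")
    #     'die'
    #     >>> EnglishStemmer().stem("skies")
    #     'sky'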
  1192. def stem(self, word):
  1193. """
  1194. Stem an English word and return the stemmed form.
  1195. :param word: The word that is stemmed.
  1196. :type word: str or unicode
  1197. :return: The stemmed form.
  1198. :rtype: unicode
  1199. """
  1200. word = word.lower()
  1201. if word in self.stopwords or len(word) <= 2:
  1202. return word
  1203. elif word in self.__special_words:
  1204. return self.__special_words[word]
  1205. # Map the different apostrophe characters to a single consistent one
  1206. word = (
  1207. word.replace("\u2019", "\x27")
  1208. .replace("\u2018", "\x27")
  1209. .replace("\u201B", "\x27")
  1210. )
  1211. if word.startswith("\x27"):
  1212. word = word[1:]
  1213. if word.startswith("y"):
  1214. word = "".join(("Y", word[1:]))
  1215. for i in range(1, len(word)):
  1216. if word[i - 1] in self.__vowels and word[i] == "y":
  1217. word = "".join((word[:i], "Y", word[i + 1 :]))
  1218. step1a_vowel_found = False
  1219. step1b_vowel_found = False
  1220. r1 = ""
  1221. r2 = ""
  1222. if word.startswith(("gener", "commun", "arsen")):
  1223. if word.startswith(("gener", "arsen")):
  1224. r1 = word[5:]
  1225. else:
  1226. r1 = word[6:]
  1227. for i in range(1, len(r1)):
  1228. if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
  1229. r2 = r1[i + 1 :]
  1230. break
  1231. else:
  1232. r1, r2 = self._r1r2_standard(word, self.__vowels)
  1233. # STEP 0
  1234. for suffix in self.__step0_suffixes:
  1235. if word.endswith(suffix):
  1236. word = word[: -len(suffix)]
  1237. r1 = r1[: -len(suffix)]
  1238. r2 = r2[: -len(suffix)]
  1239. break
  1240. # STEP 1a
  1241. for suffix in self.__step1a_suffixes:
  1242. if word.endswith(suffix):
  1243. if suffix == "sses":
  1244. word = word[:-2]
  1245. r1 = r1[:-2]
  1246. r2 = r2[:-2]
  1247. elif suffix in ("ied", "ies"):
  1248. if len(word[: -len(suffix)]) > 1:
  1249. word = word[:-2]
  1250. r1 = r1[:-2]
  1251. r2 = r2[:-2]
  1252. else:
  1253. word = word[:-1]
  1254. r1 = r1[:-1]
  1255. r2 = r2[:-1]
  1256. elif suffix == "s":
  1257. for letter in word[:-2]:
  1258. if letter in self.__vowels:
  1259. step1a_vowel_found = True
  1260. break
  1261. if step1a_vowel_found:
  1262. word = word[:-1]
  1263. r1 = r1[:-1]
  1264. r2 = r2[:-1]
  1265. break
  1266. # STEP 1b
  1267. for suffix in self.__step1b_suffixes:
  1268. if word.endswith(suffix):
  1269. if suffix in ("eed", "eedly"):
  1270. if r1.endswith(suffix):
  1271. word = suffix_replace(word, suffix, "ee")
  1272. if len(r1) >= len(suffix):
  1273. r1 = suffix_replace(r1, suffix, "ee")
  1274. else:
  1275. r1 = ""
  1276. if len(r2) >= len(suffix):
  1277. r2 = suffix_replace(r2, suffix, "ee")
  1278. else:
  1279. r2 = ""
  1280. else:
  1281. for letter in word[: -len(suffix)]:
  1282. if letter in self.__vowels:
  1283. step1b_vowel_found = True
  1284. break
  1285. if step1b_vowel_found:
  1286. word = word[: -len(suffix)]
  1287. r1 = r1[: -len(suffix)]
  1288. r2 = r2[: -len(suffix)]
  1289. if word.endswith(("at", "bl", "iz")):
  1290. word = "".join((word, "e"))
  1291. r1 = "".join((r1, "e"))
  1292. if len(word) > 5 or len(r1) >= 3:
  1293. r2 = "".join((r2, "e"))
  1294. elif word.endswith(self.__double_consonants):
  1295. word = word[:-1]
  1296. r1 = r1[:-1]
  1297. r2 = r2[:-1]
  1298. elif (
  1299. r1 == ""
  1300. and len(word) >= 3
  1301. and word[-1] not in self.__vowels
  1302. and word[-1] not in "wxY"
  1303. and word[-2] in self.__vowels
  1304. and word[-3] not in self.__vowels
  1305. ) or (
  1306. r1 == ""
  1307. and len(word) == 2
  1308. and word[0] in self.__vowels
  1309. and word[1] not in self.__vowels
  1310. ):
  1311. word = "".join((word, "e"))
  1312. if len(r1) > 0:
  1313. r1 = "".join((r1, "e"))
  1314. if len(r2) > 0:
  1315. r2 = "".join((r2, "e"))
  1316. break
  1317. # STEP 1c
  1318. if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels:
  1319. word = "".join((word[:-1], "i"))
  1320. if len(r1) >= 1:
  1321. r1 = "".join((r1[:-1], "i"))
  1322. else:
  1323. r1 = ""
  1324. if len(r2) >= 1:
  1325. r2 = "".join((r2[:-1], "i"))
  1326. else:
  1327. r2 = ""
  1328. # STEP 2
  1329. for suffix in self.__step2_suffixes:
  1330. if word.endswith(suffix):
  1331. if r1.endswith(suffix):
  1332. if suffix == "tional":
  1333. word = word[:-2]
  1334. r1 = r1[:-2]
  1335. r2 = r2[:-2]
  1336. elif suffix in ("enci", "anci", "abli"):
  1337. word = "".join((word[:-1], "e"))
  1338. if len(r1) >= 1:
  1339. r1 = "".join((r1[:-1], "e"))
  1340. else:
  1341. r1 = ""
  1342. if len(r2) >= 1:
  1343. r2 = "".join((r2[:-1], "e"))
  1344. else:
  1345. r2 = ""
  1346. elif suffix == "entli":
  1347. word = word[:-2]
  1348. r1 = r1[:-2]
  1349. r2 = r2[:-2]
  1350. elif suffix in ("izer", "ization"):
  1351. word = suffix_replace(word, suffix, "ize")
  1352. if len(r1) >= len(suffix):
  1353. r1 = suffix_replace(r1, suffix, "ize")
  1354. else:
  1355. r1 = ""
  1356. if len(r2) >= len(suffix):
  1357. r2 = suffix_replace(r2, suffix, "ize")
  1358. else:
  1359. r2 = ""
  1360. elif suffix in ("ational", "ation", "ator"):
  1361. word = suffix_replace(word, suffix, "ate")
  1362. if len(r1) >= len(suffix):
  1363. r1 = suffix_replace(r1, suffix, "ate")
  1364. else:
  1365. r1 = ""
  1366. if len(r2) >= len(suffix):
  1367. r2 = suffix_replace(r2, suffix, "ate")
  1368. else:
  1369. r2 = "e"
  1370. elif suffix in ("alism", "aliti", "alli"):
  1371. word = suffix_replace(word, suffix, "al")
  1372. if len(r1) >= len(suffix):
  1373. r1 = suffix_replace(r1, suffix, "al")
  1374. else:
  1375. r1 = ""
  1376. if len(r2) >= len(suffix):
  1377. r2 = suffix_replace(r2, suffix, "al")
  1378. else:
  1379. r2 = ""
  1380. elif suffix == "fulness":
  1381. word = word[:-4]
  1382. r1 = r1[:-4]
  1383. r2 = r2[:-4]
  1384. elif suffix in ("ousli", "ousness"):
  1385. word = suffix_replace(word, suffix, "ous")
  1386. if len(r1) >= len(suffix):
  1387. r1 = suffix_replace(r1, suffix, "ous")
  1388. else:
  1389. r1 = ""
  1390. if len(r2) >= len(suffix):
  1391. r2 = suffix_replace(r2, suffix, "ous")
  1392. else:
  1393. r2 = ""
  1394. elif suffix in ("iveness", "iviti"):
  1395. word = suffix_replace(word, suffix, "ive")
  1396. if len(r1) >= len(suffix):
  1397. r1 = suffix_replace(r1, suffix, "ive")
  1398. else:
  1399. r1 = ""
  1400. if len(r2) >= len(suffix):
  1401. r2 = suffix_replace(r2, suffix, "ive")
  1402. else:
  1403. r2 = "e"
  1404. elif suffix in ("biliti", "bli"):
  1405. word = suffix_replace(word, suffix, "ble")
  1406. if len(r1) >= len(suffix):
  1407. r1 = suffix_replace(r1, suffix, "ble")
  1408. else:
  1409. r1 = ""
  1410. if len(r2) >= len(suffix):
  1411. r2 = suffix_replace(r2, suffix, "ble")
  1412. else:
  1413. r2 = ""
  1414. elif suffix == "ogi" and word[-4] == "l":
  1415. word = word[:-1]
  1416. r1 = r1[:-1]
  1417. r2 = r2[:-1]
  1418. elif suffix in ("fulli", "lessli"):
  1419. word = word[:-2]
  1420. r1 = r1[:-2]
  1421. r2 = r2[:-2]
  1422. elif suffix == "li" and word[-3] in self.__li_ending:
  1423. word = word[:-2]
  1424. r1 = r1[:-2]
  1425. r2 = r2[:-2]
  1426. break
  1427. # STEP 3
  1428. for suffix in self.__step3_suffixes:
  1429. if word.endswith(suffix):
  1430. if r1.endswith(suffix):
  1431. if suffix == "tional":
  1432. word = word[:-2]
  1433. r1 = r1[:-2]
  1434. r2 = r2[:-2]
  1435. elif suffix == "ational":
  1436. word = suffix_replace(word, suffix, "ate")
  1437. if len(r1) >= len(suffix):
  1438. r1 = suffix_replace(r1, suffix, "ate")
  1439. else:
  1440. r1 = ""
  1441. if len(r2) >= len(suffix):
  1442. r2 = suffix_replace(r2, suffix, "ate")
  1443. else:
  1444. r2 = ""
  1445. elif suffix == "alize":
  1446. word = word[:-3]
  1447. r1 = r1[:-3]
  1448. r2 = r2[:-3]
  1449. elif suffix in ("icate", "iciti", "ical"):
  1450. word = suffix_replace(word, suffix, "ic")
  1451. if len(r1) >= len(suffix):
  1452. r1 = suffix_replace(r1, suffix, "ic")
  1453. else:
  1454. r1 = ""
  1455. if len(r2) >= len(suffix):
  1456. r2 = suffix_replace(r2, suffix, "ic")
  1457. else:
  1458. r2 = ""
  1459. elif suffix in ("ful", "ness"):
  1460. word = word[: -len(suffix)]
  1461. r1 = r1[: -len(suffix)]
  1462. r2 = r2[: -len(suffix)]
  1463. elif suffix == "ative" and r2.endswith(suffix):
  1464. word = word[:-5]
  1465. r1 = r1[:-5]
  1466. r2 = r2[:-5]
  1467. break
  1468. # STEP 4
  1469. for suffix in self.__step4_suffixes:
  1470. if word.endswith(suffix):
  1471. if r2.endswith(suffix):
  1472. if suffix == "ion":
  1473. if word[-4] in "st":
  1474. word = word[:-3]
  1475. r1 = r1[:-3]
  1476. r2 = r2[:-3]
  1477. else:
  1478. word = word[: -len(suffix)]
  1479. r1 = r1[: -len(suffix)]
  1480. r2 = r2[: -len(suffix)]
  1481. break
  1482. # STEP 5
  1483. if r2.endswith("l") and word[-2] == "l":
  1484. word = word[:-1]
  1485. elif r2.endswith("e"):
  1486. word = word[:-1]
  1487. elif r1.endswith("e"):
  1488. if len(word) >= 4 and (
  1489. word[-2] in self.__vowels
  1490. or word[-2] in "wxY"
  1491. or word[-3] not in self.__vowels
  1492. or word[-4] in self.__vowels
  1493. ):
  1494. word = word[:-1]
  1495. word = word.replace("Y", "y")
  1496. return word
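
# Illustrative usage of the English stemmer above (a minimal sketch: the
# import path assumes this module is installed as nltk.stem.snowball, and the
# results are obtained by tracing the steps above):
#
#     >>> from nltk.stem.snowball import EnglishStemmer
#     >>> stemmer = EnglishStemmer()
#     >>> stemmer.stem("running")      # step 1b strips "ing", then undoubles "nn"
#     'run'
#     >>> stemmer.stem("generously")   # step 1c gives "generousli", step 2 maps "ousli" to "ous"
#     'generous'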
class FinnishStemmer(_StandardStemmer):
    """
    The Finnish Snowball stemmer.

    :cvar __vowels: The Finnish vowels.
    :type __vowels: unicode
    :cvar __restricted_vowels: A subset of the Finnish vowels.
    :type __restricted_vowels: unicode
    :cvar __long_vowels: The Finnish vowels in their long forms.
    :type __long_vowels: tuple
    :cvar __consonants: The Finnish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Finnish double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Finnish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/finnish/stemmer.html
    """

    __vowels = "aeiouy\xE4\xF6"
    __restricted_vowels = "aeiou\xE4\xF6"
    __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6")
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = (
        "bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm",
        "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz",
    )
    __step1_suffixes = (
        "kaan", "k\xE4\xE4n", "sti", "kin", "han", "h\xE4n",
        "ko", "k\xF6", "pa", "p\xE4",
    )
    __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en")
    __step3_suffixes = (
        "siin", "tten", "seen", "han", "hen", "hin", "hon", "h\xE4n", "h\xF6n",
        "den", "tta", "tt\xE4", "ssa", "ss\xE4", "sta", "st\xE4", "lla",
        "ll\xE4", "lta", "lt\xE4", "lle", "ksi", "ine", "ta", "t\xE4",
        "na", "n\xE4", "a", "\xE4", "n",
    )
    __step4_suffixes = (
        "impi", "impa", "imp\xE4", "immi", "imma", "imm\xE4",
        "mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4", "eja", "ej\xE4",
    )
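
    # Illustrative note (a sketch, not part of the original module): the
    # step-3 case endings above are only removed under the letter-specific
    # guards in stem() below -- for example "hin" is dropped only after an
    # "i", "seen" only after a long vowel, and "siin"/"den"/"tten" only
    # after "i" preceded by one of the restricted vowels.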
  1609. def stem(self, word):
  1610. """
  1611. Stem a Finnish word and return the stemmed form.
  1612. :param word: The word that is stemmed.
  1613. :type word: str or unicode
  1614. :return: The stemmed form.
  1615. :rtype: unicode
  1616. """
  1617. word = word.lower()
  1618. if word in self.stopwords:
  1619. return word
  1620. step3_success = False
  1621. r1, r2 = self._r1r2_standard(word, self.__vowels)
  1622. # STEP 1: Particles etc.
  1623. for suffix in self.__step1_suffixes:
  1624. if r1.endswith(suffix):
  1625. if suffix == "sti":
  1626. if suffix in r2:
  1627. word = word[:-3]
  1628. r1 = r1[:-3]
  1629. r2 = r2[:-3]
  1630. else:
  1631. if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6":
  1632. word = word[: -len(suffix)]
  1633. r1 = r1[: -len(suffix)]
  1634. r2 = r2[: -len(suffix)]
  1635. break
  1636. # STEP 2: Possessives
  1637. for suffix in self.__step2_suffixes:
  1638. if r1.endswith(suffix):
  1639. if suffix == "si":
  1640. if word[-3] != "k":
  1641. word = word[:-2]
  1642. r1 = r1[:-2]
  1643. r2 = r2[:-2]
  1644. elif suffix == "ni":
  1645. word = word[:-2]
  1646. r1 = r1[:-2]
  1647. r2 = r2[:-2]
  1648. if word.endswith("kse"):
  1649. word = suffix_replace(word, "kse", "ksi")
  1650. if r1.endswith("kse"):
  1651. r1 = suffix_replace(r1, "kse", "ksi")
  1652. if r2.endswith("kse"):
  1653. r2 = suffix_replace(r2, "kse", "ksi")
  1654. elif suffix == "an":
  1655. if word[-4:-2] in ("ta", "na") or word[-5:-2] in (
  1656. "ssa",
  1657. "sta",
  1658. "lla",
  1659. "lta",
  1660. ):
  1661. word = word[:-2]
  1662. r1 = r1[:-2]
  1663. r2 = r2[:-2]
  1664. elif suffix == "\xE4n":
  1665. if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in (
  1666. "ss\xE4",
  1667. "st\xE4",
  1668. "ll\xE4",
  1669. "lt\xE4",
  1670. ):
  1671. word = word[:-2]
  1672. r1 = r1[:-2]
  1673. r2 = r2[:-2]
  1674. elif suffix == "en":
  1675. if word[-5:-2] in ("lle", "ine"):
  1676. word = word[:-2]
  1677. r1 = r1[:-2]
  1678. r2 = r2[:-2]
  1679. else:
  1680. word = word[:-3]
  1681. r1 = r1[:-3]
  1682. r2 = r2[:-3]
  1683. break
  1684. # STEP 3: Cases
  1685. for suffix in self.__step3_suffixes:
  1686. if r1.endswith(suffix):
  1687. if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"):
  1688. if (
  1689. (suffix == "han" and word[-4] == "a")
  1690. or (suffix == "hen" and word[-4] == "e")
  1691. or (suffix == "hin" and word[-4] == "i")
  1692. or (suffix == "hon" and word[-4] == "o")
  1693. or (suffix == "h\xE4n" and word[-4] == "\xE4")
  1694. or (suffix == "h\xF6n" and word[-4] == "\xF6")
  1695. ):
  1696. word = word[:-3]
  1697. r1 = r1[:-3]
  1698. r2 = r2[:-3]
  1699. step3_success = True
  1700. elif suffix in ("siin", "den", "tten"):
  1701. if (
  1702. word[-len(suffix) - 1] == "i"
  1703. and word[-len(suffix) - 2] in self.__restricted_vowels
  1704. ):
  1705. word = word[: -len(suffix)]
  1706. r1 = r1[: -len(suffix)]
  1707. r2 = r2[: -len(suffix)]
  1708. step3_success = True
  1709. else:
  1710. continue
  1711. elif suffix == "seen":
  1712. if word[-6:-4] in self.__long_vowels:
  1713. word = word[:-4]
  1714. r1 = r1[:-4]
  1715. r2 = r2[:-4]
  1716. step3_success = True
  1717. else:
  1718. continue
  1719. elif suffix in ("a", "\xE4"):
  1720. if word[-2] in self.__vowels and word[-3] in self.__consonants:
  1721. word = word[:-1]
  1722. r1 = r1[:-1]
  1723. r2 = r2[:-1]
  1724. step3_success = True
  1725. elif suffix in ("tta", "tt\xE4"):
  1726. if word[-4] == "e":
  1727. word = word[:-3]
  1728. r1 = r1[:-3]
  1729. r2 = r2[:-3]
  1730. step3_success = True
  1731. elif suffix == "n":
  1732. word = word[:-1]
  1733. r1 = r1[:-1]
  1734. r2 = r2[:-1]
  1735. step3_success = True
  1736. if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
  1737. word = word[:-1]
  1738. r1 = r1[:-1]
  1739. r2 = r2[:-1]
  1740. else:
  1741. word = word[: -len(suffix)]
  1742. r1 = r1[: -len(suffix)]
  1743. r2 = r2[: -len(suffix)]
  1744. step3_success = True
  1745. break
  1746. # STEP 4: Other endings
  1747. for suffix in self.__step4_suffixes:
  1748. if r2.endswith(suffix):
  1749. if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"):
  1750. if word[-5:-3] != "po":
  1751. word = word[:-3]
  1752. r1 = r1[:-3]
  1753. r2 = r2[:-3]
  1754. else:
  1755. word = word[: -len(suffix)]
  1756. r1 = r1[: -len(suffix)]
  1757. r2 = r2[: -len(suffix)]
  1758. break
  1759. # STEP 5: Plurals
  1760. if step3_success and len(r1) >= 1 and r1[-1] in "ij":
  1761. word = word[:-1]
  1762. r1 = r1[:-1]
  1763. elif (
  1764. not step3_success
  1765. and len(r1) >= 2
  1766. and r1[-1] == "t"
  1767. and r1[-2] in self.__vowels
  1768. ):
  1769. word = word[:-1]
  1770. r1 = r1[:-1]
  1771. r2 = r2[:-1]
  1772. if r2.endswith("imma"):
  1773. word = word[:-4]
  1774. r1 = r1[:-4]
  1775. elif r2.endswith("mma") and r2[-5:-3] != "po":
  1776. word = word[:-3]
  1777. r1 = r1[:-3]
  1778. # STEP 6: Tidying up
  1779. if r1[-2:] in self.__long_vowels:
  1780. word = word[:-1]
  1781. r1 = r1[:-1]
  1782. if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei":
  1783. word = word[:-1]
  1784. r1 = r1[:-1]
  1785. if r1.endswith(("oj", "uj")):
  1786. word = word[:-1]
  1787. r1 = r1[:-1]
  1788. if r1.endswith("jo"):
  1789. word = word[:-1]
  1790. r1 = r1[:-1]
  1791. # If the word ends with a double consonant
  1792. # followed by zero or more vowels, the last consonant is removed.
  1793. for i in range(1, len(word)):
  1794. if word[-i] in self.__vowels:
  1795. continue
  1796. else:
  1797. if i == 1:
  1798. if word[-i - 1 :] in self.__double_consonants:
  1799. word = word[:-1]
  1800. else:
  1801. if word[-i - 1 : -i + 1] in self.__double_consonants:
  1802. word = "".join((word[:-i], word[-i + 1 :]))
  1803. break
  1804. return word
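
# Illustrative usage of the Finnish stemmer above (a minimal sketch: the
# import path assumes this module is installed as nltk.stem.snowball, and the
# result is obtained by tracing the steps above with the default
# ignore_stopwords=False):
#
#     >>> from nltk.stem.snowball import FinnishStemmer
#     >>> FinnishStemmer().stem("taloihin")  # step 3 drops "hin", step 5 drops the plural "i"
#     'talo'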
class FrenchStemmer(_StandardStemmer):
    """
    The French Snowball stemmer.

    :cvar __vowels: The French vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
    :type __step2a_suffixes: tuple
    :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
    :type __step2b_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the French
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/french/stemmer.html
    """

    __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9"
    __step1_suffixes = (
        "issements", "issement", "atrices", "atrice", "ateurs", "ations",
        "logies", "usions", "utions", "ements", "amment", "emment",
        "ances", "iqUes", "ismes", "ables", "istes", "ateur", "ation",
        "logie", "usion", "ution", "ences", "ement", "euses", "ments",
        "ance", "iqUe", "isme", "able", "iste", "ence", "it\xE9s", "ives",
        "eaux", "euse", "ment", "eux", "it\xE9", "ive", "ifs", "aux", "if",
    )
    __step2a_suffixes = (
        "issaIent", "issantes", "iraIent", "issante", "issants", "issions",
        "irions", "issais", "issait", "issant", "issent", "issiez", "issons",
        "irais", "irait", "irent", "iriez", "irons", "iront", "isses",
        "issez", "\xEEmes", "\xEEtes", "irai", "iras", "irez", "isse",
        "ies", "ira", "\xEEt", "ie", "ir", "is", "it", "i",
    )
    __step2b_suffixes = (
        "eraIent", "assions", "erions", "assent", "assiez", "\xE8rent",
        "erais", "erait", "eriez", "erons", "eront", "aIent", "antes",
        "asses", "ions", "erai", "eras", "erez", "\xE2mes", "\xE2tes",
        "ante", "ants", "asse", "\xE9es", "era", "iez", "ais", "ait",
        "ant", "\xE9e", "\xE9s", "er", "ez", "\xE2t", "ai", "as",
        "\xE9", "a",
    )
    __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB")
  1946. def stem(self, word):
  1947. """
  1948. Stem a French word and return the stemmed form.
  1949. :param word: The word that is stemmed.
  1950. :type word: str or unicode
  1951. :return: The stemmed form.
  1952. :rtype: unicode
  1953. """
  1954. word = word.lower()
  1955. if word in self.stopwords:
  1956. return word
  1957. step1_success = False
  1958. rv_ending_found = False
  1959. step2a_success = False
  1960. step2b_success = False
  1961. # Every occurrence of 'u' after 'q' is put into upper case.
  1962. for i in range(1, len(word)):
  1963. if word[i - 1] == "q" and word[i] == "u":
  1964. word = "".join((word[:i], "U", word[i + 1 :]))
  1965. # Every occurrence of 'u' and 'i'
  1966. # between vowels is put into upper case.
  1967. # Every occurrence of 'y' preceded or
  1968. # followed by a vowel is also put into upper case.
  1969. for i in range(1, len(word) - 1):
  1970. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  1971. if word[i] == "u":
  1972. word = "".join((word[:i], "U", word[i + 1 :]))
  1973. elif word[i] == "i":
  1974. word = "".join((word[:i], "I", word[i + 1 :]))
  1975. if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels:
  1976. if word[i] == "y":
  1977. word = "".join((word[:i], "Y", word[i + 1 :]))
  1978. r1, r2 = self._r1r2_standard(word, self.__vowels)
  1979. rv = self.__rv_french(word, self.__vowels)
  1980. # STEP 1: Standard suffix removal
  1981. for suffix in self.__step1_suffixes:
  1982. if word.endswith(suffix):
  1983. if suffix == "eaux":
  1984. word = word[:-1]
  1985. step1_success = True
  1986. elif suffix in ("euse", "euses"):
  1987. if suffix in r2:
  1988. word = word[: -len(suffix)]
  1989. step1_success = True
  1990. elif suffix in r1:
  1991. word = suffix_replace(word, suffix, "eux")
  1992. step1_success = True
  1993. elif suffix in ("ement", "ements") and suffix in rv:
  1994. word = word[: -len(suffix)]
  1995. step1_success = True
  1996. if word[-2:] == "iv" and "iv" in r2:
  1997. word = word[:-2]
  1998. if word[-2:] == "at" and "at" in r2:
  1999. word = word[:-2]
  2000. elif word[-3:] == "eus":
  2001. if "eus" in r2:
  2002. word = word[:-3]
  2003. elif "eus" in r1:
  2004. word = "".join((word[:-1], "x"))
  2005. elif word[-3:] in ("abl", "iqU"):
  2006. if "abl" in r2 or "iqU" in r2:
  2007. word = word[:-3]
  2008. elif word[-3:] in ("i\xE8r", "I\xE8r"):
  2009. if "i\xE8r" in rv or "I\xE8r" in rv:
  2010. word = "".join((word[:-3], "i"))
  2011. elif suffix == "amment" and suffix in rv:
  2012. word = suffix_replace(word, "amment", "ant")
  2013. rv = suffix_replace(rv, "amment", "ant")
  2014. rv_ending_found = True
  2015. elif suffix == "emment" and suffix in rv:
  2016. word = suffix_replace(word, "emment", "ent")
  2017. rv_ending_found = True
  2018. elif (
  2019. suffix in ("ment", "ments")
  2020. and suffix in rv
  2021. and not rv.startswith(suffix)
  2022. and rv[rv.rindex(suffix) - 1] in self.__vowels
  2023. ):
  2024. word = word[: -len(suffix)]
  2025. rv = rv[: -len(suffix)]
  2026. rv_ending_found = True
  2027. elif suffix == "aux" and suffix in r1:
  2028. word = "".join((word[:-2], "l"))
  2029. step1_success = True
  2030. elif (
  2031. suffix in ("issement", "issements")
  2032. and suffix in r1
  2033. and word[-len(suffix) - 1] not in self.__vowels
  2034. ):
  2035. word = word[: -len(suffix)]
  2036. step1_success = True
  2037. elif (
  2038. suffix
  2039. in (
  2040. "ance",
  2041. "iqUe",
  2042. "isme",
  2043. "able",
  2044. "iste",
  2045. "eux",
  2046. "ances",
  2047. "iqUes",
  2048. "ismes",
  2049. "ables",
  2050. "istes",
  2051. )
  2052. and suffix in r2
  2053. ):
  2054. word = word[: -len(suffix)]
  2055. step1_success = True
  2056. elif (
  2057. suffix
  2058. in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations")
  2059. and suffix in r2
  2060. ):
  2061. word = word[: -len(suffix)]
  2062. step1_success = True
  2063. if word[-2:] == "ic":
  2064. if "ic" in r2:
  2065. word = word[:-2]
  2066. else:
  2067. word = "".join((word[:-2], "iqU"))
  2068. elif suffix in ("logie", "logies") and suffix in r2:
  2069. word = suffix_replace(word, suffix, "log")
  2070. step1_success = True
  2071. elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2:
  2072. word = suffix_replace(word, suffix, "u")
  2073. step1_success = True
  2074. elif suffix in ("ence", "ences") and suffix in r2:
  2075. word = suffix_replace(word, suffix, "ent")
  2076. step1_success = True
  2077. elif suffix in ("it\xE9", "it\xE9s") and suffix in r2:
  2078. word = word[: -len(suffix)]
  2079. step1_success = True
  2080. if word[-4:] == "abil":
  2081. if "abil" in r2:
  2082. word = word[:-4]
  2083. else:
  2084. word = "".join((word[:-2], "l"))
  2085. elif word[-2:] == "ic":
  2086. if "ic" in r2:
  2087. word = word[:-2]
  2088. else:
  2089. word = "".join((word[:-2], "iqU"))
  2090. elif word[-2:] == "iv":
  2091. if "iv" in r2:
  2092. word = word[:-2]
  2093. elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2:
  2094. word = word[: -len(suffix)]
  2095. step1_success = True
  2096. if word[-2:] == "at" and "at" in r2:
  2097. word = word[:-2]
  2098. if word[-2:] == "ic":
  2099. if "ic" in r2:
  2100. word = word[:-2]
  2101. else:
  2102. word = "".join((word[:-2], "iqU"))
  2103. break
  2104. # STEP 2a: Verb suffixes beginning 'i'
  2105. if not step1_success or rv_ending_found:
  2106. for suffix in self.__step2a_suffixes:
  2107. if word.endswith(suffix):
  2108. if (
  2109. suffix in rv
  2110. and len(rv) > len(suffix)
  2111. and rv[rv.rindex(suffix) - 1] not in self.__vowels
  2112. ):
  2113. word = word[: -len(suffix)]
  2114. step2a_success = True
  2115. break
  2116. # STEP 2b: Other verb suffixes
  2117. if not step2a_success:
  2118. for suffix in self.__step2b_suffixes:
  2119. if rv.endswith(suffix):
  2120. if suffix == "ions" and "ions" in r2:
  2121. word = word[:-4]
  2122. step2b_success = True
  2123. elif suffix in (
  2124. 'eraIent',
  2125. 'erions',
  2126. '\xE8rent',
  2127. 'erais',
  2128. 'erait',
  2129. 'eriez',
  2130. 'erons',
  2131. 'eront',
  2132. 'erai',
  2133. 'eras',
  2134. 'erez',
  2135. '\xE9es',
  2136. 'era',
  2137. 'iez',
  2138. '\xE9e',
  2139. '\xE9s',
  2140. 'er',
  2141. 'ez',
  2142. '\xE9',
  2143. ):
  2144. word = word[: -len(suffix)]
  2145. step2b_success = True
  2146. elif suffix in (
  2147. 'assions',
  2148. 'assent',
  2149. 'assiez',
  2150. 'aIent',
  2151. 'antes',
  2152. 'asses',
  2153. '\xE2mes',
  2154. '\xE2tes',
  2155. 'ante',
  2156. 'ants',
  2157. 'asse',
  2158. 'ais',
  2159. 'ait',
  2160. 'ant',
  2161. '\xE2t',
  2162. 'ai',
  2163. 'as',
  2164. 'a',
  2165. ):
  2166. word = word[: -len(suffix)]
  2167. rv = rv[: -len(suffix)]
  2168. step2b_success = True
  2169. if rv.endswith("e"):
  2170. word = word[:-1]
  2171. break
  2172. # STEP 3
  2173. if step1_success or step2a_success or step2b_success:
  2174. if word[-1] == "Y":
  2175. word = "".join((word[:-1], "i"))
  2176. elif word[-1] == "\xE7":
  2177. word = "".join((word[:-1], "c"))
  2178. # STEP 4: Residual suffixes
  2179. else:
  2180. if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s":
  2181. word = word[:-1]
  2182. for suffix in self.__step4_suffixes:
  2183. if word.endswith(suffix):
  2184. if suffix in rv:
  2185. if suffix == "ion" and suffix in r2 and rv[-4] in "st":
  2186. word = word[:-3]
  2187. elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"):
  2188. word = suffix_replace(word, suffix, "i")
  2189. elif suffix == "e":
  2190. word = word[:-1]
  2191. elif suffix == "\xEB" and word[-3:-1] == "gu":
  2192. word = word[:-1]
  2193. break
  2194. # STEP 5: Undouble
  2195. if word.endswith(("enn", "onn", "ett", "ell", "eill")):
  2196. word = word[:-1]
  2197. # STEP 6: Un-accent
  2198. for i in range(1, len(word)):
  2199. if word[-i] not in self.__vowels:
  2200. i += 1
  2201. else:
  2202. if i != 1 and word[-i] in ("\xE9", "\xE8"):
  2203. word = "".join((word[:-i], "e", word[-i + 1 :]))
  2204. break
  2205. word = word.replace("I", "i").replace("U", "u").replace("Y", "y")
  2206. return word

    def __rv_french(self, word, vowels):
        """
        Return the region RV that is used by the French stemmer.

        If the word begins with two vowels, RV is the region after
        the third letter. Otherwise, it is the region after the first
        vowel not at the beginning of the word, or the end of the word
        if these positions cannot be found. (Exceptionally, u'par',
        u'col' or u'tap' at the beginning of a word is also taken to
        define RV as the region to their right.)

        :param word: The French word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The French vowels that are used to determine
                       the region RV.
        :type vowels: unicode
        :return: the region RV for the respective French word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of
               the subclass FrenchStemmer. It is not to be invoked directly!
        """
        rv = ""
        if len(word) >= 2:
            if word.startswith(("par", "col", "tap")) or (
                word[0] in vowels and word[1] in vowels
            ):
                rv = word[3:]
            else:
                for i in range(1, len(word)):
                    if word[i] in vowels:
                        rv = word[i + 1 :]
                        break
        return rv
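
    # Worked examples for the RV rule above (a sketch derived directly from
    # the branches of __rv_french, not from the Snowball test data):
    #
    #     "aimer"  -> two leading vowels          -> RV = "er"
    #     "parler" -> exceptional prefix "par"    -> RV = "ler"
    #     "dormir" -> first non-initial vowel "o" -> RV = "rmir"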
class GermanStemmer(_StandardStemmer):
    """
    The German Snowball stemmer.

    :cvar __vowels: The German vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters that may directly appear before a word final 's'.
    :type __s_ending: unicode
    :cvar __st_ending: Letter that may directly appear before a word final 'st'.
    :type __st_ending: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the German
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/german/stemmer.html
    """

    __vowels = "aeiouy\xE4\xF6\xFC"
    __s_ending = "bdfghklmnrt"
    __st_ending = "bdfghklmnt"
    __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
    __step2_suffixes = ("est", "en", "er", "st")
    __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik")

    def stem(self, word):
        """
        Stem a German word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        word = word.replace("\xDF", "ss")

        # Every occurrence of 'u' and 'y'
        # between vowels is put into upper case.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1 :]))
                elif word[i] == "y":
                    word = "".join((word[:i], "Y", word[i + 1 :]))

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # R1 is adjusted so that the region before it
        # contains at least 3 letters.
        for i in range(1, len(word)):
            if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
                if len(word[: i + 1]) < 3 and len(word[: i + 1]) > 0:
                    r1 = word[3:]
                elif len(word[: i + 1]) == 0:
                    return word
                break

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if (
                    suffix in ("en", "es", "e")
                    and word[-len(suffix) - 4 : -len(suffix)] == "niss"
                ):
                    word = word[: -len(suffix) - 1]
                    r1 = r1[: -len(suffix) - 1]
                    r2 = r2[: -len(suffix) - 1]

                elif suffix == "s":
                    if word[-2] in self.__s_ending:
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                if suffix == "st":
                    if word[-3] in self.__st_ending and len(word[:-3]) >= 3:
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                    r2 = r2[: -len(suffix)]
                break

        # STEP 3: Derivational suffixes
        for suffix in self.__step3_suffixes:
            if r2.endswith(suffix):
                if suffix in ("end", "ung"):
                    if (
                        "ig" in r2[-len(suffix) - 2 : -len(suffix)]
                        and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2]
                    ):
                        word = word[: -len(suffix) - 2]
                    else:
                        word = word[: -len(suffix)]

                elif (
                    suffix in ("ig", "ik", "isch")
                    and "e" not in r2[-len(suffix) - 1 : -len(suffix)]
                ):
                    word = word[: -len(suffix)]

                elif suffix in ("lich", "heit"):
                    if (
                        "er" in r1[-len(suffix) - 2 : -len(suffix)]
                        or "en" in r1[-len(suffix) - 2 : -len(suffix)]
                    ):
                        word = word[: -len(suffix) - 2]
                    else:
                        word = word[: -len(suffix)]

                elif suffix == "keit":
                    if "lich" in r2[-len(suffix) - 4 : -len(suffix)]:
                        word = word[: -len(suffix) - 4]

                    elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]:
                        word = word[: -len(suffix) - 2]
                    else:
                        word = word[: -len(suffix)]
                break

        # Umlaut accents are removed and
        # 'u' and 'y' are put back into lower case.
        word = (
            word.replace("\xE4", "a")
            .replace("\xF6", "o")
            .replace("\xFC", "u")
            .replace("U", "u")
            .replace("Y", "y")
        )

        return word
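
# Illustrative usage of the German stemmer above (a minimal sketch: the import
# path assumes this module is installed as nltk.stem.snowball, and the result
# is obtained by tracing the steps above with the default
# ignore_stopwords=False):
#
#     >>> from nltk.stem.snowball import GermanStemmer
#     >>> GermanStemmer().stem("laufen")  # step 1 removes the "en" ending
#     'lauf'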
class HungarianStemmer(_LanguageSpecificStemmer):
    """
    The Hungarian Snowball stemmer.

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
    """

    __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB"
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = (
        "bb", "cc", "ccs", "dd", "ff", "gg", "ggy", "jj", "kk", "ll",
        "lly", "mm", "nn", "nny", "pp", "rr", "ss", "ssz", "tt", "tty",
        "vv", "zz", "zzs",
    )
    __step1_suffixes = ("al", "el")
    __step2_suffixes = (
        "k\xE9ppen", "onk\xE9nt", "enk\xE9nt", "ank\xE9nt", "k\xE9pp",
        "k\xE9nt", "ban", "ben", "nak", "nek", "val", "vel", "t\xF3l",
        "t\xF5l", "r\xF3l", "r\xF5l", "b\xF3l", "b\xF5l", "hoz", "hez",
        "h\xF6z", "n\xE1l", "n\xE9l", "\xE9rt", "kor", "ba", "be", "ra",
        "re", "ig", "at", "et", "ot", "\xF6t", "ul", "\xFCl", "v\xE1",
        "v\xE9", "en", "on", "an", "\xF6n", "n", "t",
    )
    __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n")
    __step4_suffixes = (
        "astul", "est\xFCl", "\xE1stul", "\xE9st\xFCl", "stul", "st\xFCl",
    )
    __step5_suffixes = ("\xE1", "\xE9")
    __step6_suffixes = (
        "ok\xE9", "\xF6k\xE9", "ak\xE9", "ek\xE9", "\xE1k\xE9", "\xE1\xE9i",
        "\xE9k\xE9", "\xE9\xE9i", "k\xE9", "\xE9i", "\xE9\xE9", "\xE9",
    )
    __step7_suffixes = (
        "\xE1juk", "\xE9j\xFCk", "\xFCnk", "unk", "juk", "j\xFCk", "\xE1nk",
        "\xE9nk", "nk", "uk", "\xFCk", "em", "om", "am", "od", "ed", "ad",
        "\xF6d", "ja", "je", "\xE1m", "\xE1d", "\xE9m", "\xE9d", "m", "d",
        "a", "e", "o", "\xE1", "\xE9",
    )
    __step8_suffixes = (
        "jaitok", "jeitek", "jaink", "jeink", "aitok", "eitek", "\xE1itok",
        "\xE9itek", "jaim", "jeim", "jaid", "jeid", "eink", "aink", "itek",
        "jeik", "jaik", "\xE1ink", "\xE9ink", "aim", "eim", "aid", "eid",
        "jai", "jei", "ink", "aik", "eik", "\xE1im", "\xE1id", "\xE1ik",
        "\xE9im", "\xE9id", "\xE9ik", "im", "id", "ai", "ei", "ik",
        "\xE1i", "\xE9i", "i",
    )
    __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k")

    def stem(self, word):
        """
        Stem a Hungarian word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)

        # STEP 1: Remove instrumental case
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons) : -2] == double_cons:
                    word = "".join((word[:-4], word[-3]))

                    if r1[-2 - len(double_cons) : -2] == double_cons:
                        r1 = "".join((r1[:-4], r1[-3]))
                    break

        # STEP 2: Remove frequent cases
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]

                    if r1.endswith("\xE1"):
                        word = "".join((word[:-1], "a"))
                        r1 = suffix_replace(r1, "\xE1", "a")

                    elif r1.endswith("\xE9"):
                        word = "".join((word[:-1], "e"))
                        r1 = suffix_replace(r1, "\xE9", "e")
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == "\xE9n":
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == "\xE1stul":
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")

                elif suffix == "\xE9st\xFCl":
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 5: Remove factive case
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons) : -1] == double_cons:
                        word = "".join((word[:-3], word[-2]))

                        if r1[-1 - len(double_cons) : -1] == double_cons:
                            r1 = "".join((r1[:-3], r1[-2]))
                        break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in ("\xE1k\xE9", "\xE1\xE9i"):
                    word = suffix_replace(word, suffix, "a")
                    r1 = suffix_replace(r1, suffix, "a")

                elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"):
                    word = suffix_replace(word, suffix, "e")
                    r1 = suffix_replace(r1, suffix, "e")
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"):
                        word = suffix_replace(word, suffix, "a")
                        r1 = suffix_replace(r1, suffix, "a")

                    elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"):
                        word = suffix_replace(word, suffix, "e")
                        r1 = suffix_replace(r1, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (
                        "\xE1im",
                        "\xE1id",
                        "\xE1i",
                        "\xE1ink",
                        "\xE1itok",
                        "\xE1ik",
                    ):
                        word = suffix_replace(word, suffix, "a")
                        r1 = suffix_replace(r1, suffix, "a")

                    elif suffix in (
                        "\xE9im",
                        "\xE9id",
                        "\xE9i",
                        "\xE9ink",
                        "\xE9itek",
                        "\xE9ik",
                    ):
                        word = suffix_replace(word, suffix, "e")
                        r1 = suffix_replace(r1, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                        r1 = r1[: -len(suffix)]
                break

        # STEP 9: Remove plural suffixes
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == "\xE1k":
                        word = suffix_replace(word, suffix, "a")
                    elif suffix == "\xE9k":
                        word = suffix_replace(word, suffix, "e")
                    else:
                        word = word[: -len(suffix)]
                break

        return word
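
    # Illustrative usage (a sketch: the result is obtained by tracing the
    # steps above with the default ignore_stopwords=False, not quoted from
    # the Snowball test data):
    #
    #     >>> HungarianStemmer().stem("almák")  # step 9 maps the plural "ák" to "a"
    #     'alma'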

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of the subclass
               HungarianStemmer. It is not to be invoked directly!
        """
        r1 = ""
        if word[0] in vowels:
            for digraph in digraphs:
                if digraph in word[1:]:
                    r1 = word[word.index(digraph[-1]) + 1 :]
                    return r1

            for i in range(1, len(word)):
                if word[i] not in vowels:
                    r1 = word[i + 1 :]
                    break
        else:
            for i in range(1, len(word)):
                if word[i] in vowels:
                    r1 = word[i + 1 :]
                    break
        return r1
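
    # Worked examples for the R1 rule above (a sketch derived directly from
    # the branches of __r1_hungarian):
    #
    #     "alma"  -> starts with a vowel, first consonant "l"  -> R1 = "ma"
    #     "anya"  -> starts with a vowel, first digraph "ny"   -> R1 = "a"
    #     "csont" -> starts with a consonant, first vowel "o"  -> R1 = "nt"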
class ItalianStemmer(_StandardStemmer):
    """
    The Italian Snowball stemmer.

    :cvar __vowels: The Italian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :note: A detailed description of the Italian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/italian/stemmer.html
    """

    __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9"
    __step0_suffixes = (
        "gliela", "gliele", "glieli", "glielo", "gliene", "sene", "mela",
        "mele", "meli", "melo", "mene", "tela", "tele", "teli", "telo",
        "tene", "cela", "cele", "celi", "celo", "cene", "vela", "vele",
        "veli", "velo", "vene", "gli", "ci", "la", "le", "li", "lo",
        "mi", "ne", "si", "ti", "vi",
    )
    __step1_suffixes = (
        "atrice", "atrici", "azione", "azioni", "uzione", "uzioni",
        "usione", "usioni", "amento", "amenti", "imento", "imenti",
        "amente", "abile", "abili", "ibile", "ibili", "mente", "atore",
        "atori", "logia", "logie", "anza", "anze", "iche", "ichi", "ismo",
        "ismi", "ista", "iste", "isti", "ist\xE0", "ist\xE8", "ist\xEC",
        "ante", "anti", "enza", "enze", "ico", "ici", "ica", "ice", "oso",
        "osi", "osa", "ose", "it\xE0", "ivo", "ivi", "iva", "ive",
    )
    __step2_suffixes = (
        "erebbero", "irebbero", "assero", "assimo", "eranno", "erebbe",
        "eremmo", "ereste", "eresti", "essero", "iranno", "irebbe",
        "iremmo", "ireste", "iresti", "iscano", "iscono", "issero",
        "arono", "avamo", "avano", "avate", "eremo", "erete", "erono",
        "evamo", "evano", "evate", "iremo", "irete", "irono", "ivamo",
        "ivano", "ivate", "ammo", "ando", "asse", "assi", "emmo", "enda",
        "ende", "endi", "endo", "erai", "erei", "Yamo", "iamo", "immo",
        "irai", "irei", "isca", "isce", "isci", "isco", "ano", "are",
        "ata", "ate", "ati", "ato", "ava", "avi", "avo", "er\xE0", "ere",
        "er\xF2", "ete", "eva", "evi", "evo", "ir\xE0", "ire", "ir\xF2",
        "ita", "ite", "iti", "ito", "iva", "ivi", "ivo", "ono", "uta",
        "ute", "uti", "uto", "ar", "ir",
    )
  2942. def stem(self, word):
  2943. """
  2944. Stem an Italian word and return the stemmed form.
  2945. :param word: The word that is stemmed.
  2946. :type word: str or unicode
  2947. :return: The stemmed form.
  2948. :rtype: unicode
  2949. """
  2950. word = word.lower()
  2951. if word in self.stopwords:
  2952. return word
  2953. step1_success = False
  2954. # All acute accents are replaced by grave accents.
  2955. word = (
  2956. word.replace("\xE1", "\xE0")
  2957. .replace("\xE9", "\xE8")
  2958. .replace("\xED", "\xEC")
  2959. .replace("\xF3", "\xF2")
  2960. .replace("\xFA", "\xF9")
  2961. )
  2962. # Every occurrence of 'u' after 'q'
  2963. # is put into upper case.
  2964. for i in range(1, len(word)):
  2965. if word[i - 1] == "q" and word[i] == "u":
  2966. word = "".join((word[:i], "U", word[i + 1 :]))
  2967. # Every occurrence of 'u' and 'i'
  2968. # between vowels is put into upper case.
  2969. for i in range(1, len(word) - 1):
  2970. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  2971. if word[i] == "u":
  2972. word = "".join((word[:i], "U", word[i + 1 :]))
  2973. elif word[i] == "i":
  2974. word = "".join((word[:i], "I", word[i + 1 :]))
  2975. r1, r2 = self._r1r2_standard(word, self.__vowels)
  2976. rv = self._rv_standard(word, self.__vowels)
  2977. # STEP 0: Attached pronoun
  2978. for suffix in self.__step0_suffixes:
  2979. if rv.endswith(suffix):
  2980. if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"):
  2981. word = word[: -len(suffix)]
  2982. r1 = r1[: -len(suffix)]
  2983. r2 = r2[: -len(suffix)]
  2984. rv = rv[: -len(suffix)]
  2985. elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"):
  2986. word = suffix_replace(word, suffix, "e")
  2987. r1 = suffix_replace(r1, suffix, "e")
  2988. r2 = suffix_replace(r2, suffix, "e")
  2989. rv = suffix_replace(rv, suffix, "e")
  2990. break
  2991. # STEP 1: Standard suffix removal
  2992. for suffix in self.__step1_suffixes:
  2993. if word.endswith(suffix):
  2994. if suffix == "amente" and r1.endswith(suffix):
  2995. step1_success = True
  2996. word = word[:-6]
  2997. r2 = r2[:-6]
  2998. rv = rv[:-6]
  2999. if r2.endswith("iv"):
  3000. word = word[:-2]
  3001. r2 = r2[:-2]
  3002. rv = rv[:-2]
  3003. if r2.endswith("at"):
  3004. word = word[:-2]
  3005. rv = rv[:-2]
  3006. elif r2.endswith(("os", "ic")):
  3007. word = word[:-2]
  3008. rv = rv[:-2]
  3009. elif r2.endswith("abil"):
  3010. word = word[:-4]
  3011. rv = rv[:-4]
  3012. elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(
  3013. suffix
  3014. ):
  3015. step1_success = True
  3016. word = word[:-6]
  3017. rv = rv[:-6]
  3018. elif r2.endswith(suffix):
  3019. step1_success = True
  3020. if suffix in ("azione", "azioni", "atore", "atori"):
  3021. word = word[: -len(suffix)]
  3022. r2 = r2[: -len(suffix)]
  3023. rv = rv[: -len(suffix)]
  3024. if r2.endswith("ic"):
  3025. word = word[:-2]
  3026. rv = rv[:-2]
  3027. elif suffix in ("logia", "logie"):
  3028. word = word[:-2]
  3029. rv = word[:-2]
  3030. elif suffix in ("uzione", "uzioni", "usione", "usioni"):
  3031. word = word[:-5]
  3032. rv = rv[:-5]
  3033. elif suffix in ("enza", "enze"):
  3034. word = suffix_replace(word, suffix, "te")
  3035. rv = suffix_replace(rv, suffix, "te")
  3036. elif suffix == "it\xE0":
  3037. word = word[:-3]
  3038. r2 = r2[:-3]
  3039. rv = rv[:-3]
  3040. if r2.endswith(("ic", "iv")):
  3041. word = word[:-2]
  3042. rv = rv[:-2]
  3043. elif r2.endswith("abil"):
  3044. word = word[:-4]
  3045. rv = rv[:-4]
  3046. elif suffix in ("ivo", "ivi", "iva", "ive"):
  3047. word = word[:-3]
  3048. r2 = r2[:-3]
  3049. rv = rv[:-3]
  3050. if r2.endswith("at"):
  3051. word = word[:-2]
  3052. r2 = r2[:-2]
  3053. rv = rv[:-2]
  3054. if r2.endswith("ic"):
  3055. word = word[:-2]
  3056. rv = rv[:-2]
  3057. else:
  3058. word = word[: -len(suffix)]
  3059. rv = rv[: -len(suffix)]
  3060. break
  3061. # STEP 2: Verb suffixes
  3062. if not step1_success:
  3063. for suffix in self.__step2_suffixes:
  3064. if rv.endswith(suffix):
  3065. word = word[: -len(suffix)]
  3066. rv = rv[: -len(suffix)]
  3067. break
  3068. # STEP 3a
  3069. if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")):
  3070. word = word[:-1]
  3071. rv = rv[:-1]
  3072. if rv.endswith("i"):
  3073. word = word[:-1]
  3074. rv = rv[:-1]
  3075. # STEP 3b
  3076. if rv.endswith(("ch", "gh")):
  3077. word = word[:-1]
  3078. word = word.replace("I", "i").replace("U", "u")
  3079. return word
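# Illustrative usage sketch (assuming this module is importable as
# nltk.stem.snowball).  Note that stem() lower-cases its input and rewrites
# acute accents as grave accents before matching suffixes; the outputs are
# deliberately not asserted here.
#
#     from nltk.stem.snowball import ItalianStemmer
#
#     stemmer = ItalianStemmer()
#     for w in ("abbandonata", "pronunziare", "felicità"):
#         print(w, "->", stemmer.stem(w))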
  3080. class NorwegianStemmer(_ScandinavianStemmer):
  3081. """
  3082. The Norwegian Snowball stemmer.
  3083. :cvar __vowels: The Norwegian vowels.
  3084. :type __vowels: unicode
:cvar __s_ending: Letters that may directly appear before a word-final 's'.
  3086. :type __s_ending: unicode
  3087. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  3088. :type __step1_suffixes: tuple
  3089. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  3090. :type __step2_suffixes: tuple
  3091. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  3092. :type __step3_suffixes: tuple
  3093. :note: A detailed description of the Norwegian
  3094. stemming algorithm can be found under
  3095. http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
  3096. """
  3097. __vowels = "aeiouy\xE6\xE5\xF8"
  3098. __s_ending = "bcdfghjlmnoprtvyz"
  3099. __step1_suffixes = (
  3100. "hetenes",
  3101. "hetene",
  3102. "hetens",
  3103. "heter",
  3104. "heten",
  3105. "endes",
  3106. "ande",
  3107. "ende",
  3108. "edes",
  3109. "enes",
  3110. "erte",
  3111. "ede",
  3112. "ane",
  3113. "ene",
  3114. "ens",
  3115. "ers",
  3116. "ets",
  3117. "het",
  3118. "ast",
  3119. "ert",
  3120. "en",
  3121. "ar",
  3122. "er",
  3123. "as",
  3124. "es",
  3125. "et",
  3126. "a",
  3127. "e",
  3128. "s",
  3129. )
  3130. __step2_suffixes = ("dt", "vt")
  3131. __step3_suffixes = (
  3132. "hetslov",
  3133. "eleg",
  3134. "elig",
  3135. "elov",
  3136. "slov",
  3137. "leg",
  3138. "eig",
  3139. "lig",
  3140. "els",
  3141. "lov",
  3142. "ig",
  3143. )
  3144. def stem(self, word):
  3145. """
  3146. Stem a Norwegian word and return the stemmed form.
  3147. :param word: The word that is stemmed.
  3148. :type word: str or unicode
  3149. :return: The stemmed form.
  3150. :rtype: unicode
  3151. """
  3152. word = word.lower()
  3153. if word in self.stopwords:
  3154. return word
  3155. r1 = self._r1_scandinavian(word, self.__vowels)
  3156. # STEP 1
  3157. for suffix in self.__step1_suffixes:
  3158. if r1.endswith(suffix):
  3159. if suffix in ("erte", "ert"):
  3160. word = suffix_replace(word, suffix, "er")
  3161. r1 = suffix_replace(r1, suffix, "er")
  3162. elif suffix == "s":
  3163. if word[-2] in self.__s_ending or (
  3164. word[-2] == "k" and word[-3] not in self.__vowels
  3165. ):
  3166. word = word[:-1]
  3167. r1 = r1[:-1]
  3168. else:
  3169. word = word[: -len(suffix)]
  3170. r1 = r1[: -len(suffix)]
  3171. break
  3172. # STEP 2
  3173. for suffix in self.__step2_suffixes:
  3174. if r1.endswith(suffix):
  3175. word = word[:-1]
  3176. r1 = r1[:-1]
  3177. break
  3178. # STEP 3
  3179. for suffix in self.__step3_suffixes:
  3180. if r1.endswith(suffix):
  3181. word = word[: -len(suffix)]
  3182. break
  3183. return word
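# Illustrative sketch: language-specific classes such as NorwegianStemmer can
# also be obtained through the SnowballStemmer wrapper referenced in demo()
# below by passing the language name.  A minimal, hedged example; the output
# is not asserted.
#
#     from nltk.stem.snowball import SnowballStemmer
#
#     stemmer = SnowballStemmer("norwegian")
#     print(stemmer.stem("havnedistriktene"))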
  3184. class PortugueseStemmer(_StandardStemmer):
  3185. """
  3186. The Portuguese Snowball stemmer.
  3187. :cvar __vowels: The Portuguese vowels.
  3188. :type __vowels: unicode
  3189. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  3190. :type __step1_suffixes: tuple
  3191. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  3192. :type __step2_suffixes: tuple
  3193. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  3194. :type __step4_suffixes: tuple
  3195. :note: A detailed description of the Portuguese
  3196. stemming algorithm can be found under
  3197. http://snowball.tartarus.org/algorithms/portuguese/stemmer.html
  3198. """
  3199. __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4"
  3200. __step1_suffixes = (
  3201. 'amentos',
  3202. 'imentos',
  3203. 'uço~es',
  3204. 'amento',
  3205. 'imento',
  3206. 'adoras',
  3207. 'adores',
  3208. 'a\xE7o~es',
  3209. 'logias',
  3210. '\xEAncias',
  3211. 'amente',
  3212. 'idades',
  3213. 'an\xE7as',
  3214. 'ismos',
  3215. 'istas',
  3216. 'adora',
  3217. 'a\xE7a~o',
  3218. 'antes',
  3219. '\xE2ncia',
  3220. 'logia',
  3221. 'uça~o',
  3222. '\xEAncia',
  3223. 'mente',
  3224. 'idade',
  3225. 'an\xE7a',
  3226. 'ezas',
  3227. 'icos',
  3228. 'icas',
  3229. 'ismo',
  3230. '\xE1vel',
  3231. '\xEDvel',
  3232. 'ista',
  3233. 'osos',
  3234. 'osas',
  3235. 'ador',
  3236. 'ante',
  3237. 'ivas',
  3238. 'ivos',
  3239. 'iras',
  3240. 'eza',
  3241. 'ico',
  3242. 'ica',
  3243. 'oso',
  3244. 'osa',
  3245. 'iva',
  3246. 'ivo',
  3247. 'ira',
  3248. )
  3249. __step2_suffixes = (
  3250. 'ar\xEDamos',
  3251. 'er\xEDamos',
  3252. 'ir\xEDamos',
  3253. '\xE1ssemos',
  3254. '\xEAssemos',
  3255. '\xEDssemos',
  3256. 'ar\xEDeis',
  3257. 'er\xEDeis',
  3258. 'ir\xEDeis',
  3259. '\xE1sseis',
  3260. '\xE9sseis',
  3261. '\xEDsseis',
  3262. '\xE1ramos',
  3263. '\xE9ramos',
  3264. '\xEDramos',
  3265. '\xE1vamos',
  3266. 'aremos',
  3267. 'eremos',
  3268. 'iremos',
  3269. 'ariam',
  3270. 'eriam',
  3271. 'iriam',
  3272. 'assem',
  3273. 'essem',
  3274. 'issem',
  3275. 'ara~o',
  3276. 'era~o',
  3277. 'ira~o',
  3278. 'arias',
  3279. 'erias',
  3280. 'irias',
  3281. 'ardes',
  3282. 'erdes',
  3283. 'irdes',
  3284. 'asses',
  3285. 'esses',
  3286. 'isses',
  3287. 'astes',
  3288. 'estes',
  3289. 'istes',
  3290. '\xE1reis',
  3291. 'areis',
  3292. '\xE9reis',
  3293. 'ereis',
  3294. '\xEDreis',
  3295. 'ireis',
  3296. '\xE1veis',
  3297. '\xEDamos',
  3298. 'armos',
  3299. 'ermos',
  3300. 'irmos',
  3301. 'aria',
  3302. 'eria',
  3303. 'iria',
  3304. 'asse',
  3305. 'esse',
  3306. 'isse',
  3307. 'aste',
  3308. 'este',
  3309. 'iste',
  3310. 'arei',
  3311. 'erei',
  3312. 'irei',
  3313. 'aram',
  3314. 'eram',
  3315. 'iram',
  3316. 'avam',
  3317. 'arem',
  3318. 'erem',
  3319. 'irem',
  3320. 'ando',
  3321. 'endo',
  3322. 'indo',
  3323. 'adas',
  3324. 'idas',
  3325. 'ar\xE1s',
  3326. 'aras',
  3327. 'er\xE1s',
  3328. 'eras',
  3329. 'ir\xE1s',
  3330. 'avas',
  3331. 'ares',
  3332. 'eres',
  3333. 'ires',
  3334. '\xEDeis',
  3335. 'ados',
  3336. 'idos',
  3337. '\xE1mos',
  3338. 'amos',
  3339. 'emos',
  3340. 'imos',
  3341. 'iras',
  3342. 'ada',
  3343. 'ida',
  3344. 'ar\xE1',
  3345. 'ara',
  3346. 'er\xE1',
  3347. 'era',
  3348. 'ir\xE1',
  3349. 'ava',
  3350. 'iam',
  3351. 'ado',
  3352. 'ido',
  3353. 'ias',
  3354. 'ais',
  3355. 'eis',
  3356. 'ira',
  3357. 'ia',
  3358. 'ei',
  3359. 'am',
  3360. 'em',
  3361. 'ar',
  3362. 'er',
  3363. 'ir',
  3364. 'as',
  3365. 'es',
  3366. 'is',
  3367. 'eu',
  3368. 'iu',
  3369. 'ou',
  3370. )
  3371. __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3")
  3372. def stem(self, word):
  3373. """
  3374. Stem a Portuguese word and return the stemmed form.
  3375. :param word: The word that is stemmed.
  3376. :type word: str or unicode
  3377. :return: The stemmed form.
  3378. :rtype: unicode
  3379. """
  3380. word = word.lower()
  3381. if word in self.stopwords:
  3382. return word
  3383. step1_success = False
  3384. step2_success = False
  3385. word = (
  3386. word.replace("\xE3", "a~")
  3387. .replace("\xF5", "o~")
  3388. .replace("q\xFC", "qu")
  3389. .replace("g\xFC", "gu")
  3390. )
  3391. r1, r2 = self._r1r2_standard(word, self.__vowels)
  3392. rv = self._rv_standard(word, self.__vowels)
  3393. # STEP 1: Standard suffix removal
  3394. for suffix in self.__step1_suffixes:
  3395. if word.endswith(suffix):
  3396. if suffix == "amente" and r1.endswith(suffix):
  3397. step1_success = True
  3398. word = word[:-6]
  3399. r2 = r2[:-6]
  3400. rv = rv[:-6]
  3401. if r2.endswith("iv"):
  3402. word = word[:-2]
  3403. r2 = r2[:-2]
  3404. rv = rv[:-2]
  3405. if r2.endswith("at"):
  3406. word = word[:-2]
  3407. rv = rv[:-2]
  3408. elif r2.endswith(("os", "ic", "ad")):
  3409. word = word[:-2]
  3410. rv = rv[:-2]
  3411. elif (
  3412. suffix in ("ira", "iras")
  3413. and rv.endswith(suffix)
  3414. and word[-len(suffix) - 1 : -len(suffix)] == "e"
  3415. ):
  3416. step1_success = True
  3417. word = suffix_replace(word, suffix, "ir")
  3418. rv = suffix_replace(rv, suffix, "ir")
  3419. elif r2.endswith(suffix):
  3420. step1_success = True
  3421. if suffix in ("logia", "logias"):
  3422. word = suffix_replace(word, suffix, "log")
  3423. rv = suffix_replace(rv, suffix, "log")
  3424. elif suffix in ("uça~o", "uço~es"):
  3425. word = suffix_replace(word, suffix, "u")
  3426. rv = suffix_replace(rv, suffix, "u")
  3427. elif suffix in ("\xEAncia", "\xEAncias"):
  3428. word = suffix_replace(word, suffix, "ente")
  3429. rv = suffix_replace(rv, suffix, "ente")
  3430. elif suffix == "mente":
  3431. word = word[:-5]
  3432. r2 = r2[:-5]
  3433. rv = rv[:-5]
  3434. if r2.endswith(("ante", "avel", "ivel")):
  3435. word = word[:-4]
  3436. rv = rv[:-4]
  3437. elif suffix in ("idade", "idades"):
  3438. word = word[: -len(suffix)]
  3439. r2 = r2[: -len(suffix)]
  3440. rv = rv[: -len(suffix)]
  3441. if r2.endswith(("ic", "iv")):
  3442. word = word[:-2]
  3443. rv = rv[:-2]
  3444. elif r2.endswith("abil"):
  3445. word = word[:-4]
  3446. rv = rv[:-4]
  3447. elif suffix in ("iva", "ivo", "ivas", "ivos"):
  3448. word = word[: -len(suffix)]
  3449. r2 = r2[: -len(suffix)]
  3450. rv = rv[: -len(suffix)]
  3451. if r2.endswith("at"):
  3452. word = word[:-2]
  3453. rv = rv[:-2]
  3454. else:
  3455. word = word[: -len(suffix)]
  3456. rv = rv[: -len(suffix)]
  3457. break
  3458. # STEP 2: Verb suffixes
  3459. if not step1_success:
  3460. for suffix in self.__step2_suffixes:
  3461. if rv.endswith(suffix):
  3462. step2_success = True
  3463. word = word[: -len(suffix)]
  3464. rv = rv[: -len(suffix)]
  3465. break
  3466. # STEP 3
  3467. if step1_success or step2_success:
  3468. if rv.endswith("i") and word[-2] == "c":
  3469. word = word[:-1]
  3470. rv = rv[:-1]
# STEP 4: Residual suffix
  3472. if not step1_success and not step2_success:
  3473. for suffix in self.__step4_suffixes:
  3474. if rv.endswith(suffix):
  3475. word = word[: -len(suffix)]
  3476. rv = rv[: -len(suffix)]
  3477. break
  3478. # STEP 5
  3479. if rv.endswith(("e", "\xE9", "\xEA")):
  3480. word = word[:-1]
  3481. rv = rv[:-1]
  3482. if (word.endswith("gu") and rv.endswith("u")) or (
  3483. word.endswith("ci") and rv.endswith("i")
  3484. ):
  3485. word = word[:-1]
  3486. elif word.endswith("\xE7"):
  3487. word = suffix_replace(word, "\xE7", "c")
  3488. word = word.replace("a~", "\xE3").replace("o~", "\xF5")
  3489. return word
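# Sketch of the nasal-vowel round trip used above (illustrative only): "ã" and
# "õ" are temporarily rewritten as "a~" and "o~" so that suffixes such as
# "ça~o"/"ço~es" can be matched with plain string operations, and the
# rewriting is undone just before the stem is returned.
#
#     word = "corações"
#     working = word.replace("\xE3", "a~").replace("\xF5", "o~")   # 'coraço~es'
#     restored = working.replace("a~", "\xE3").replace("o~", "\xF5")
#     assert restored == word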
  3490. class RomanianStemmer(_StandardStemmer):
  3491. """
  3492. The Romanian Snowball stemmer.
  3493. :cvar __vowels: The Romanian vowels.
  3494. :type __vowels: unicode
  3495. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  3496. :type __step0_suffixes: tuple
  3497. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  3498. :type __step1_suffixes: tuple
  3499. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  3500. :type __step2_suffixes: tuple
  3501. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  3502. :type __step3_suffixes: tuple
  3503. :note: A detailed description of the Romanian
  3504. stemming algorithm can be found under
  3505. http://snowball.tartarus.org/algorithms/romanian/stemmer.html
  3506. """
  3507. __vowels = "aeiou\u0103\xE2\xEE"
  3508. __step0_suffixes = (
  3509. 'iilor',
  3510. 'ului',
  3511. 'elor',
  3512. 'iile',
  3513. 'ilor',
  3514. 'atei',
  3515. 'a\u0163ie',
  3516. 'a\u0163ia',
  3517. 'aua',
  3518. 'ele',
  3519. 'iua',
  3520. 'iei',
  3521. 'ile',
  3522. 'ul',
  3523. 'ea',
  3524. 'ii',
  3525. )
  3526. __step1_suffixes = (
  3527. 'abilitate',
  3528. 'abilitati',
  3529. 'abilit\u0103\u0163i',
  3530. 'ibilitate',
  3531. 'abilit\u0103i',
  3532. 'ivitate',
  3533. 'ivitati',
  3534. 'ivit\u0103\u0163i',
  3535. 'icitate',
  3536. 'icitati',
  3537. 'icit\u0103\u0163i',
  3538. 'icatori',
  3539. 'ivit\u0103i',
  3540. 'icit\u0103i',
  3541. 'icator',
  3542. 'a\u0163iune',
  3543. 'atoare',
  3544. '\u0103toare',
  3545. 'i\u0163iune',
  3546. 'itoare',
  3547. 'iciva',
  3548. 'icive',
  3549. 'icivi',
  3550. 'iciv\u0103',
  3551. 'icala',
  3552. 'icale',
  3553. 'icali',
  3554. 'ical\u0103',
  3555. 'ativa',
  3556. 'ative',
  3557. 'ativi',
  3558. 'ativ\u0103',
  3559. 'atori',
  3560. '\u0103tori',
  3561. 'itiva',
  3562. 'itive',
  3563. 'itivi',
  3564. 'itiv\u0103',
  3565. 'itori',
  3566. 'iciv',
  3567. 'ical',
  3568. 'ativ',
  3569. 'ator',
  3570. '\u0103tor',
  3571. 'itiv',
  3572. 'itor',
  3573. )
  3574. __step2_suffixes = (
  3575. 'abila',
  3576. 'abile',
  3577. 'abili',
  3578. 'abil\u0103',
  3579. 'ibila',
  3580. 'ibile',
  3581. 'ibili',
  3582. 'ibil\u0103',
  3583. 'atori',
  3584. 'itate',
  3585. 'itati',
  3586. 'it\u0103\u0163i',
  3587. 'abil',
  3588. 'ibil',
  3589. 'oasa',
  3590. 'oas\u0103',
  3591. 'oase',
  3592. 'anta',
  3593. 'ante',
  3594. 'anti',
  3595. 'ant\u0103',
  3596. 'ator',
  3597. 'it\u0103i',
  3598. 'iune',
  3599. 'iuni',
  3600. 'isme',
  3601. 'ista',
  3602. 'iste',
  3603. 'isti',
  3604. 'ist\u0103',
  3605. 'i\u015Fti',
  3606. 'ata',
  3607. 'at\u0103',
  3608. 'ati',
  3609. 'ate',
  3610. 'uta',
  3611. 'ut\u0103',
  3612. 'uti',
  3613. 'ute',
  3614. 'ita',
  3615. 'it\u0103',
  3616. 'iti',
  3617. 'ite',
  3618. 'ica',
  3619. 'ice',
  3620. 'ici',
  3621. 'ic\u0103',
  3622. 'osi',
  3623. 'o\u015Fi',
  3624. 'ant',
  3625. 'iva',
  3626. 'ive',
  3627. 'ivi',
  3628. 'iv\u0103',
  3629. 'ism',
  3630. 'ist',
  3631. 'at',
  3632. 'ut',
  3633. 'it',
  3634. 'ic',
  3635. 'os',
  3636. 'iv',
  3637. )
  3638. __step3_suffixes = (
  3639. 'seser\u0103\u0163i',
  3640. 'aser\u0103\u0163i',
  3641. 'iser\u0103\u0163i',
  3642. '\xE2ser\u0103\u0163i',
  3643. 'user\u0103\u0163i',
  3644. 'seser\u0103m',
  3645. 'aser\u0103m',
  3646. 'iser\u0103m',
  3647. '\xE2ser\u0103m',
  3648. 'user\u0103m',
  3649. 'ser\u0103\u0163i',
  3650. 'sese\u015Fi',
  3651. 'seser\u0103',
  3652. 'easc\u0103',
  3653. 'ar\u0103\u0163i',
  3654. 'ur\u0103\u0163i',
  3655. 'ir\u0103\u0163i',
  3656. '\xE2r\u0103\u0163i',
  3657. 'ase\u015Fi',
  3658. 'aser\u0103',
  3659. 'ise\u015Fi',
  3660. 'iser\u0103',
  3661. '\xe2se\u015Fi',
  3662. '\xE2ser\u0103',
  3663. 'use\u015Fi',
  3664. 'user\u0103',
  3665. 'ser\u0103m',
  3666. 'sesem',
  3667. 'indu',
  3668. '\xE2ndu',
  3669. 'eaz\u0103',
  3670. 'e\u015Fti',
  3671. 'e\u015Fte',
  3672. '\u0103\u015Fti',
  3673. '\u0103\u015Fte',
  3674. 'ea\u0163i',
  3675. 'ia\u0163i',
  3676. 'ar\u0103m',
  3677. 'ur\u0103m',
  3678. 'ir\u0103m',
  3679. '\xE2r\u0103m',
  3680. 'asem',
  3681. 'isem',
  3682. '\xE2sem',
  3683. 'usem',
  3684. 'se\u015Fi',
  3685. 'ser\u0103',
  3686. 'sese',
  3687. 'are',
  3688. 'ere',
  3689. 'ire',
  3690. '\xE2re',
  3691. 'ind',
  3692. '\xE2nd',
  3693. 'eze',
  3694. 'ezi',
  3695. 'esc',
  3696. '\u0103sc',
  3697. 'eam',
  3698. 'eai',
  3699. 'eau',
  3700. 'iam',
  3701. 'iai',
  3702. 'iau',
  3703. 'a\u015Fi',
  3704. 'ar\u0103',
  3705. 'u\u015Fi',
  3706. 'ur\u0103',
  3707. 'i\u015Fi',
  3708. 'ir\u0103',
  3709. '\xE2\u015Fi',
  3710. '\xe2r\u0103',
  3711. 'ase',
  3712. 'ise',
  3713. '\xE2se',
  3714. 'use',
  3715. 'a\u0163i',
  3716. 'e\u0163i',
  3717. 'i\u0163i',
  3718. '\xe2\u0163i',
  3719. 'sei',
  3720. 'ez',
  3721. 'am',
  3722. 'ai',
  3723. 'au',
  3724. 'ea',
  3725. 'ia',
  3726. 'ui',
  3727. '\xE2i',
  3728. '\u0103m',
  3729. 'em',
  3730. 'im',
  3731. '\xE2m',
  3732. 'se',
  3733. )
  3734. def stem(self, word):
  3735. """
  3736. Stem a Romanian word and return the stemmed form.
  3737. :param word: The word that is stemmed.
  3738. :type word: str or unicode
  3739. :return: The stemmed form.
  3740. :rtype: unicode
  3741. """
  3742. word = word.lower()
  3743. if word in self.stopwords:
  3744. return word
  3745. step1_success = False
  3746. step2_success = False
  3747. for i in range(1, len(word) - 1):
  3748. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  3749. if word[i] == "u":
  3750. word = "".join((word[:i], "U", word[i + 1 :]))
  3751. elif word[i] == "i":
  3752. word = "".join((word[:i], "I", word[i + 1 :]))
  3753. r1, r2 = self._r1r2_standard(word, self.__vowels)
  3754. rv = self._rv_standard(word, self.__vowels)
  3755. # STEP 0: Removal of plurals and other simplifications
  3756. for suffix in self.__step0_suffixes:
  3757. if word.endswith(suffix):
  3758. if suffix in r1:
  3759. if suffix in ("ul", "ului"):
  3760. word = word[: -len(suffix)]
  3761. if suffix in rv:
  3762. rv = rv[: -len(suffix)]
  3763. else:
  3764. rv = ""
  3765. elif (
  3766. suffix == "aua"
  3767. or suffix == "atei"
  3768. or (suffix == "ile" and word[-5:-3] != "ab")
  3769. ):
  3770. word = word[:-2]
  3771. elif suffix in ("ea", "ele", "elor"):
  3772. word = suffix_replace(word, suffix, "e")
  3773. if suffix in rv:
  3774. rv = suffix_replace(rv, suffix, "e")
  3775. else:
  3776. rv = ""
  3777. elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"):
  3778. word = suffix_replace(word, suffix, "i")
  3779. if suffix in rv:
  3780. rv = suffix_replace(rv, suffix, "i")
  3781. else:
  3782. rv = ""
  3783. elif suffix in ("a\u0163ie", "a\u0163ia"):
  3784. word = word[:-1]
  3785. break
  3786. # STEP 1: Reduction of combining suffixes
  3787. while True:
  3788. replacement_done = False
  3789. for suffix in self.__step1_suffixes:
  3790. if word.endswith(suffix):
  3791. if suffix in r1:
  3792. step1_success = True
  3793. replacement_done = True
  3794. if suffix in (
  3795. "abilitate",
  3796. "abilitati",
  3797. "abilit\u0103i",
  3798. "abilit\u0103\u0163i",
  3799. ):
  3800. word = suffix_replace(word, suffix, "abil")
  3801. elif suffix == "ibilitate":
  3802. word = word[:-5]
  3803. elif suffix in (
  3804. "ivitate",
  3805. "ivitati",
  3806. "ivit\u0103i",
  3807. "ivit\u0103\u0163i",
  3808. ):
  3809. word = suffix_replace(word, suffix, "iv")
  3810. elif suffix in (
  3811. "icitate",
  3812. "icitati",
  3813. "icit\u0103i",
  3814. "icit\u0103\u0163i",
  3815. "icator",
  3816. "icatori",
  3817. "iciv",
  3818. "iciva",
  3819. "icive",
  3820. "icivi",
  3821. "iciv\u0103",
  3822. "ical",
  3823. "icala",
  3824. "icale",
  3825. "icali",
  3826. "ical\u0103",
  3827. ):
  3828. word = suffix_replace(word, suffix, "ic")
  3829. elif suffix in (
  3830. "ativ",
  3831. "ativa",
  3832. "ative",
  3833. "ativi",
  3834. "ativ\u0103",
  3835. "a\u0163iune",
  3836. "atoare",
  3837. "ator",
  3838. "atori",
  3839. "\u0103toare",
  3840. "\u0103tor",
  3841. "\u0103tori",
  3842. ):
  3843. word = suffix_replace(word, suffix, "at")
  3844. if suffix in r2:
  3845. r2 = suffix_replace(r2, suffix, "at")
  3846. elif suffix in (
  3847. "itiv",
  3848. "itiva",
  3849. "itive",
  3850. "itivi",
  3851. "itiv\u0103",
  3852. "i\u0163iune",
  3853. "itoare",
  3854. "itor",
  3855. "itori",
  3856. ):
  3857. word = suffix_replace(word, suffix, "it")
  3858. if suffix in r2:
  3859. r2 = suffix_replace(r2, suffix, "it")
  3860. else:
  3861. step1_success = False
  3862. break
  3863. if not replacement_done:
  3864. break
  3865. # STEP 2: Removal of standard suffixes
  3866. for suffix in self.__step2_suffixes:
  3867. if word.endswith(suffix):
  3868. if suffix in r2:
  3869. step2_success = True
  3870. if suffix in ("iune", "iuni"):
  3871. if word[-5] == "\u0163":
  3872. word = "".join((word[:-5], "t"))
  3873. elif suffix in (
  3874. "ism",
  3875. "isme",
  3876. "ist",
  3877. "ista",
  3878. "iste",
  3879. "isti",
  3880. "ist\u0103",
  3881. "i\u015Fti",
  3882. ):
  3883. word = suffix_replace(word, suffix, "ist")
  3884. else:
  3885. word = word[: -len(suffix)]
  3886. break
  3887. # STEP 3: Removal of verb suffixes
  3888. if not step1_success and not step2_success:
  3889. for suffix in self.__step3_suffixes:
  3890. if word.endswith(suffix):
  3891. if suffix in rv:
  3892. if suffix in (
  3893. 'seser\u0103\u0163i',
  3894. 'seser\u0103m',
  3895. 'ser\u0103\u0163i',
  3896. 'sese\u015Fi',
  3897. 'seser\u0103',
  3898. 'ser\u0103m',
  3899. 'sesem',
  3900. 'se\u015Fi',
  3901. 'ser\u0103',
  3902. 'sese',
  3903. 'a\u0163i',
  3904. 'e\u0163i',
  3905. 'i\u0163i',
  3906. '\xE2\u0163i',
  3907. 'sei',
  3908. '\u0103m',
  3909. 'em',
  3910. 'im',
  3911. '\xE2m',
  3912. 'se',
  3913. ):
  3914. word = word[: -len(suffix)]
  3915. rv = rv[: -len(suffix)]
  3916. else:
  3917. if (
  3918. not rv.startswith(suffix)
  3919. and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE"
  3920. ):
  3921. word = word[: -len(suffix)]
  3922. break
  3923. # STEP 4: Removal of final vowel
  3924. for suffix in ("ie", "a", "e", "i", "\u0103"):
  3925. if word.endswith(suffix):
  3926. if suffix in rv:
  3927. word = word[: -len(suffix)]
  3928. break
  3929. word = word.replace("I", "i").replace("U", "u")
  3930. return word
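# Illustrative usage sketch (outputs intentionally not asserted): as in the
# Italian stemmer, intervocalic "u" and "i" are temporarily upper-cased and
# mapped back to lower case at the end of stem().  The sample words are
# hypothetical.
#
#     from nltk.stem.snowball import RomanianStemmer
#
#     stemmer = RomanianStemmer()
#     for w in ("copiilor", "lucrurile"):
#         print(w, "->", stemmer.stem(w))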
  3931. class RussianStemmer(_LanguageSpecificStemmer):
  3932. """
  3933. The Russian Snowball stemmer.
  3934. :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
  3935. :type __perfective_gerund_suffixes: tuple
  3936. :cvar __adjectival_suffixes: Suffixes to be deleted.
  3937. :type __adjectival_suffixes: tuple
  3938. :cvar __reflexive_suffixes: Suffixes to be deleted.
  3939. :type __reflexive_suffixes: tuple
  3940. :cvar __verb_suffixes: Suffixes to be deleted.
  3941. :type __verb_suffixes: tuple
  3942. :cvar __noun_suffixes: Suffixes to be deleted.
  3943. :type __noun_suffixes: tuple
  3944. :cvar __superlative_suffixes: Suffixes to be deleted.
  3945. :type __superlative_suffixes: tuple
  3946. :cvar __derivational_suffixes: Suffixes to be deleted.
  3947. :type __derivational_suffixes: tuple
  3948. :note: A detailed description of the Russian
  3949. stemming algorithm can be found under
  3950. http://snowball.tartarus.org/algorithms/russian/stemmer.html
  3951. """
  3952. __perfective_gerund_suffixes = (
  3953. "ivshis'",
  3954. "yvshis'",
  3955. "vshis'",
  3956. "ivshi",
  3957. "yvshi",
  3958. "vshi",
  3959. "iv",
  3960. "yv",
  3961. "v",
  3962. )
  3963. __adjectival_suffixes = (
  3964. 'ui^ushchi^ui^u',
  3965. 'ui^ushchi^ai^a',
  3966. 'ui^ushchimi',
  3967. 'ui^ushchymi',
  3968. 'ui^ushchego',
  3969. 'ui^ushchogo',
  3970. 'ui^ushchemu',
  3971. 'ui^ushchomu',
  3972. 'ui^ushchikh',
  3973. 'ui^ushchykh',
  3974. 'ui^ushchui^u',
  3975. 'ui^ushchaia',
  3976. 'ui^ushchoi^u',
  3977. 'ui^ushchei^u',
  3978. 'i^ushchi^ui^u',
  3979. 'i^ushchi^ai^a',
  3980. 'ui^ushchee',
  3981. 'ui^ushchie',
  3982. 'ui^ushchye',
  3983. 'ui^ushchoe',
  3984. 'ui^ushchei`',
  3985. 'ui^ushchii`',
  3986. 'ui^ushchyi`',
  3987. 'ui^ushchoi`',
  3988. 'ui^ushchem',
  3989. 'ui^ushchim',
  3990. 'ui^ushchym',
  3991. 'ui^ushchom',
  3992. 'i^ushchimi',
  3993. 'i^ushchymi',
  3994. 'i^ushchego',
  3995. 'i^ushchogo',
  3996. 'i^ushchemu',
  3997. 'i^ushchomu',
  3998. 'i^ushchikh',
  3999. 'i^ushchykh',
  4000. 'i^ushchui^u',
  4001. 'i^ushchai^a',
  4002. 'i^ushchoi^u',
  4003. 'i^ushchei^u',
  4004. 'i^ushchee',
  4005. 'i^ushchie',
  4006. 'i^ushchye',
  4007. 'i^ushchoe',
  4008. 'i^ushchei`',
  4009. 'i^ushchii`',
  4010. 'i^ushchyi`',
  4011. 'i^ushchoi`',
  4012. 'i^ushchem',
  4013. 'i^ushchim',
  4014. 'i^ushchym',
  4015. 'i^ushchom',
  4016. 'shchi^ui^u',
  4017. 'shchi^ai^a',
  4018. 'ivshi^ui^u',
  4019. 'ivshi^ai^a',
  4020. 'yvshi^ui^u',
  4021. 'yvshi^ai^a',
  4022. 'shchimi',
  4023. 'shchymi',
  4024. 'shchego',
  4025. 'shchogo',
  4026. 'shchemu',
  4027. 'shchomu',
  4028. 'shchikh',
  4029. 'shchykh',
  4030. 'shchui^u',
  4031. 'shchai^a',
  4032. 'shchoi^u',
  4033. 'shchei^u',
  4034. 'ivshimi',
  4035. 'ivshymi',
  4036. 'ivshego',
  4037. 'ivshogo',
  4038. 'ivshemu',
  4039. 'ivshomu',
  4040. 'ivshikh',
  4041. 'ivshykh',
  4042. 'ivshui^u',
  4043. 'ivshai^a',
  4044. 'ivshoi^u',
  4045. 'ivshei^u',
  4046. 'yvshimi',
  4047. 'yvshymi',
  4048. 'yvshego',
  4049. 'yvshogo',
  4050. 'yvshemu',
  4051. 'yvshomu',
  4052. 'yvshikh',
  4053. 'yvshykh',
  4054. 'yvshui^u',
  4055. 'yvshai^a',
  4056. 'yvshoi^u',
  4057. 'yvshei^u',
  4058. 'vshi^ui^u',
  4059. 'vshi^ai^a',
  4060. 'shchee',
  4061. 'shchie',
  4062. 'shchye',
  4063. 'shchoe',
  4064. 'shchei`',
  4065. 'shchii`',
  4066. 'shchyi`',
  4067. 'shchoi`',
  4068. 'shchem',
  4069. 'shchim',
  4070. 'shchym',
  4071. 'shchom',
  4072. 'ivshee',
  4073. 'ivshie',
  4074. 'ivshye',
  4075. 'ivshoe',
  4076. 'ivshei`',
  4077. 'ivshii`',
  4078. 'ivshyi`',
  4079. 'ivshoi`',
  4080. 'ivshem',
  4081. 'ivshim',
  4082. 'ivshym',
  4083. 'ivshom',
  4084. 'yvshee',
  4085. 'yvshie',
  4086. 'yvshye',
  4087. 'yvshoe',
  4088. 'yvshei`',
  4089. 'yvshii`',
  4090. 'yvshyi`',
  4091. 'yvshoi`',
  4092. 'yvshem',
  4093. 'yvshim',
  4094. 'yvshym',
  4095. 'yvshom',
  4096. 'vshimi',
  4097. 'vshymi',
  4098. 'vshego',
  4099. 'vshogo',
  4100. 'vshemu',
  4101. 'vshomu',
  4102. 'vshikh',
  4103. 'vshykh',
  4104. 'vshui^u',
  4105. 'vshai^a',
  4106. 'vshoi^u',
  4107. 'vshei^u',
  4108. 'emi^ui^u',
  4109. 'emi^ai^a',
  4110. 'nni^ui^u',
  4111. 'nni^ai^a',
  4112. 'vshee',
  4113. 'vshie',
  4114. 'vshye',
  4115. 'vshoe',
  4116. 'vshei`',
  4117. 'vshii`',
  4118. 'vshyi`',
  4119. 'vshoi`',
  4120. 'vshem',
  4121. 'vshim',
  4122. 'vshym',
  4123. 'vshom',
  4124. 'emimi',
  4125. 'emymi',
  4126. 'emego',
  4127. 'emogo',
  4128. 'ememu',
  4129. 'emomu',
  4130. 'emikh',
  4131. 'emykh',
  4132. 'emui^u',
  4133. 'emai^a',
  4134. 'emoi^u',
  4135. 'emei^u',
  4136. 'nnimi',
  4137. 'nnymi',
  4138. 'nnego',
  4139. 'nnogo',
  4140. 'nnemu',
  4141. 'nnomu',
  4142. 'nnikh',
  4143. 'nnykh',
  4144. 'nnui^u',
  4145. 'nnai^a',
  4146. 'nnoi^u',
  4147. 'nnei^u',
  4148. 'emee',
  4149. 'emie',
  4150. 'emye',
  4151. 'emoe',
  4152. 'emei`',
  4153. 'emii`',
  4154. 'emyi`',
  4155. 'emoi`',
  4156. 'emem',
  4157. 'emim',
  4158. 'emym',
  4159. 'emom',
  4160. 'nnee',
  4161. 'nnie',
  4162. 'nnye',
  4163. 'nnoe',
  4164. 'nnei`',
  4165. 'nnii`',
  4166. 'nnyi`',
  4167. 'nnoi`',
  4168. 'nnem',
  4169. 'nnim',
  4170. 'nnym',
  4171. 'nnom',
  4172. 'i^ui^u',
  4173. 'i^ai^a',
  4174. 'imi',
  4175. 'ymi',
  4176. 'ego',
  4177. 'ogo',
  4178. 'emu',
  4179. 'omu',
  4180. 'ikh',
  4181. 'ykh',
  4182. 'ui^u',
  4183. 'ai^a',
  4184. 'oi^u',
  4185. 'ei^u',
  4186. 'ee',
  4187. 'ie',
  4188. 'ye',
  4189. 'oe',
  4190. 'ei`',
  4191. 'ii`',
  4192. 'yi`',
  4193. 'oi`',
  4194. 'em',
  4195. 'im',
  4196. 'ym',
  4197. 'om',
  4198. )
  4199. __reflexive_suffixes = ("si^a", "s'")
  4200. __verb_suffixes = (
  4201. "esh'",
  4202. 'ei`te',
  4203. 'ui`te',
  4204. 'ui^ut',
  4205. "ish'",
  4206. 'ete',
  4207. 'i`te',
  4208. 'i^ut',
  4209. 'nno',
  4210. 'ila',
  4211. 'yla',
  4212. 'ena',
  4213. 'ite',
  4214. 'ili',
  4215. 'yli',
  4216. 'ilo',
  4217. 'ylo',
  4218. 'eno',
  4219. 'i^at',
  4220. 'uet',
  4221. 'eny',
  4222. "it'",
  4223. "yt'",
  4224. 'ui^u',
  4225. 'la',
  4226. 'na',
  4227. 'li',
  4228. 'em',
  4229. 'lo',
  4230. 'no',
  4231. 'et',
  4232. 'ny',
  4233. "t'",
  4234. 'ei`',
  4235. 'ui`',
  4236. 'il',
  4237. 'yl',
  4238. 'im',
  4239. 'ym',
  4240. 'en',
  4241. 'it',
  4242. 'yt',
  4243. 'i^u',
  4244. 'i`',
  4245. 'l',
  4246. 'n',
  4247. )
  4248. __noun_suffixes = (
  4249. 'ii^ami',
  4250. 'ii^akh',
  4251. 'i^ami',
  4252. 'ii^am',
  4253. 'i^akh',
  4254. 'ami',
  4255. 'iei`',
  4256. 'i^am',
  4257. 'iem',
  4258. 'akh',
  4259. 'ii^u',
  4260. "'i^u",
  4261. 'ii^a',
  4262. "'i^a",
  4263. 'ev',
  4264. 'ov',
  4265. 'ie',
  4266. "'e",
  4267. 'ei',
  4268. 'ii',
  4269. 'ei`',
  4270. 'oi`',
  4271. 'ii`',
  4272. 'em',
  4273. 'am',
  4274. 'om',
  4275. 'i^u',
  4276. 'i^a',
  4277. 'a',
  4278. 'e',
  4279. 'i',
  4280. 'i`',
  4281. 'o',
  4282. 'u',
  4283. 'y',
  4284. "'",
  4285. )
  4286. __superlative_suffixes = ("ei`she", "ei`sh")
  4287. __derivational_suffixes = ("ost'", "ost")
  4288. def stem(self, word):
  4289. """
  4290. Stem a Russian word and return the stemmed form.
  4291. :param word: The word that is stemmed.
  4292. :type word: str or unicode
  4293. :return: The stemmed form.
  4294. :rtype: unicode
  4295. """
  4296. if word in self.stopwords:
  4297. return word
  4298. chr_exceeded = False
  4299. for i in range(len(word)):
  4300. if ord(word[i]) > 255:
  4301. chr_exceeded = True
  4302. break
  4303. if not chr_exceeded:
  4304. return word
  4305. word = self.__cyrillic_to_roman(word)
  4306. step1_success = False
  4307. adjectival_removed = False
  4308. verb_removed = False
  4309. undouble_success = False
  4310. superlative_removed = False
  4311. rv, r2 = self.__regions_russian(word)
  4312. # Step 1
  4313. for suffix in self.__perfective_gerund_suffixes:
  4314. if rv.endswith(suffix):
  4315. if suffix in ("v", "vshi", "vshis'"):
  4316. if (
  4317. rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
  4318. or rv[-len(suffix) - 1 : -len(suffix)] == "a"
  4319. ):
  4320. word = word[: -len(suffix)]
  4321. r2 = r2[: -len(suffix)]
  4322. rv = rv[: -len(suffix)]
  4323. step1_success = True
  4324. break
  4325. else:
  4326. word = word[: -len(suffix)]
  4327. r2 = r2[: -len(suffix)]
  4328. rv = rv[: -len(suffix)]
  4329. step1_success = True
  4330. break
  4331. if not step1_success:
  4332. for suffix in self.__reflexive_suffixes:
  4333. if rv.endswith(suffix):
  4334. word = word[: -len(suffix)]
  4335. r2 = r2[: -len(suffix)]
  4336. rv = rv[: -len(suffix)]
  4337. break
  4338. for suffix in self.__adjectival_suffixes:
  4339. if rv.endswith(suffix):
  4340. if suffix in (
  4341. 'i^ushchi^ui^u',
  4342. 'i^ushchi^ai^a',
  4343. 'i^ushchui^u',
  4344. 'i^ushchai^a',
  4345. 'i^ushchoi^u',
  4346. 'i^ushchei^u',
  4347. 'i^ushchimi',
  4348. 'i^ushchymi',
  4349. 'i^ushchego',
  4350. 'i^ushchogo',
  4351. 'i^ushchemu',
  4352. 'i^ushchomu',
  4353. 'i^ushchikh',
  4354. 'i^ushchykh',
  4355. 'shchi^ui^u',
  4356. 'shchi^ai^a',
  4357. 'i^ushchee',
  4358. 'i^ushchie',
  4359. 'i^ushchye',
  4360. 'i^ushchoe',
  4361. 'i^ushchei`',
  4362. 'i^ushchii`',
  4363. 'i^ushchyi`',
  4364. 'i^ushchoi`',
  4365. 'i^ushchem',
  4366. 'i^ushchim',
  4367. 'i^ushchym',
  4368. 'i^ushchom',
  4369. 'vshi^ui^u',
  4370. 'vshi^ai^a',
  4371. 'shchui^u',
  4372. 'shchai^a',
  4373. 'shchoi^u',
  4374. 'shchei^u',
  4375. 'emi^ui^u',
  4376. 'emi^ai^a',
  4377. 'nni^ui^u',
  4378. 'nni^ai^a',
  4379. 'shchimi',
  4380. 'shchymi',
  4381. 'shchego',
  4382. 'shchogo',
  4383. 'shchemu',
  4384. 'shchomu',
  4385. 'shchikh',
  4386. 'shchykh',
  4387. 'vshui^u',
  4388. 'vshai^a',
  4389. 'vshoi^u',
  4390. 'vshei^u',
  4391. 'shchee',
  4392. 'shchie',
  4393. 'shchye',
  4394. 'shchoe',
  4395. 'shchei`',
  4396. 'shchii`',
  4397. 'shchyi`',
  4398. 'shchoi`',
  4399. 'shchem',
  4400. 'shchim',
  4401. 'shchym',
  4402. 'shchom',
  4403. 'vshimi',
  4404. 'vshymi',
  4405. 'vshego',
  4406. 'vshogo',
  4407. 'vshemu',
  4408. 'vshomu',
  4409. 'vshikh',
  4410. 'vshykh',
  4411. 'emui^u',
  4412. 'emai^a',
  4413. 'emoi^u',
  4414. 'emei^u',
  4415. 'nnui^u',
  4416. 'nnai^a',
  4417. 'nnoi^u',
  4418. 'nnei^u',
  4419. 'vshee',
  4420. 'vshie',
  4421. 'vshye',
  4422. 'vshoe',
  4423. 'vshei`',
  4424. 'vshii`',
  4425. 'vshyi`',
  4426. 'vshoi`',
  4427. 'vshem',
  4428. 'vshim',
  4429. 'vshym',
  4430. 'vshom',
  4431. 'emimi',
  4432. 'emymi',
  4433. 'emego',
  4434. 'emogo',
  4435. 'ememu',
  4436. 'emomu',
  4437. 'emikh',
  4438. 'emykh',
  4439. 'nnimi',
  4440. 'nnymi',
  4441. 'nnego',
  4442. 'nnogo',
  4443. 'nnemu',
  4444. 'nnomu',
  4445. 'nnikh',
  4446. 'nnykh',
  4447. 'emee',
  4448. 'emie',
  4449. 'emye',
  4450. 'emoe',
  4451. 'emei`',
  4452. 'emii`',
  4453. 'emyi`',
  4454. 'emoi`',
  4455. 'emem',
  4456. 'emim',
  4457. 'emym',
  4458. 'emom',
  4459. 'nnee',
  4460. 'nnie',
  4461. 'nnye',
  4462. 'nnoe',
  4463. 'nnei`',
  4464. 'nnii`',
  4465. 'nnyi`',
  4466. 'nnoi`',
  4467. 'nnem',
  4468. 'nnim',
  4469. 'nnym',
  4470. 'nnom',
  4471. ):
  4472. if (
  4473. rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
  4474. or rv[-len(suffix) - 1 : -len(suffix)] == "a"
  4475. ):
  4476. word = word[: -len(suffix)]
  4477. r2 = r2[: -len(suffix)]
  4478. rv = rv[: -len(suffix)]
  4479. adjectival_removed = True
  4480. break
  4481. else:
  4482. word = word[: -len(suffix)]
  4483. r2 = r2[: -len(suffix)]
  4484. rv = rv[: -len(suffix)]
  4485. adjectival_removed = True
  4486. break
  4487. if not adjectival_removed:
  4488. for suffix in self.__verb_suffixes:
  4489. if rv.endswith(suffix):
  4490. if suffix in (
  4491. "la",
  4492. "na",
  4493. "ete",
  4494. "i`te",
  4495. "li",
  4496. "i`",
  4497. "l",
  4498. "em",
  4499. "n",
  4500. "lo",
  4501. "no",
  4502. "et",
  4503. "i^ut",
  4504. "ny",
  4505. "t'",
  4506. "esh'",
  4507. "nno",
  4508. ):
  4509. if (
  4510. rv[-len(suffix) - 3 : -len(suffix)] == "i^a"
  4511. or rv[-len(suffix) - 1 : -len(suffix)] == "a"
  4512. ):
  4513. word = word[: -len(suffix)]
  4514. r2 = r2[: -len(suffix)]
  4515. rv = rv[: -len(suffix)]
  4516. verb_removed = True
  4517. break
  4518. else:
  4519. word = word[: -len(suffix)]
  4520. r2 = r2[: -len(suffix)]
  4521. rv = rv[: -len(suffix)]
  4522. verb_removed = True
  4523. break
  4524. if not adjectival_removed and not verb_removed:
  4525. for suffix in self.__noun_suffixes:
  4526. if rv.endswith(suffix):
  4527. word = word[: -len(suffix)]
  4528. r2 = r2[: -len(suffix)]
  4529. rv = rv[: -len(suffix)]
  4530. break
  4531. # Step 2
  4532. if rv.endswith("i"):
  4533. word = word[:-1]
  4534. r2 = r2[:-1]
  4535. # Step 3
  4536. for suffix in self.__derivational_suffixes:
  4537. if r2.endswith(suffix):
  4538. word = word[: -len(suffix)]
  4539. break
  4540. # Step 4
  4541. if word.endswith("nn"):
  4542. word = word[:-1]
  4543. undouble_success = True
  4544. if not undouble_success:
  4545. for suffix in self.__superlative_suffixes:
  4546. if word.endswith(suffix):
  4547. word = word[: -len(suffix)]
  4548. superlative_removed = True
  4549. break
  4550. if word.endswith("nn"):
  4551. word = word[:-1]
  4552. if not undouble_success and not superlative_removed:
  4553. if word.endswith("'"):
  4554. word = word[:-1]
  4555. word = self.__roman_to_cyrillic(word)
  4556. return word
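# Usage note (illustrative sketch): stem() only processes Cyrillic input.  If
# every character of the word falls in the Latin-1 range, the word is returned
# unchanged, so already-transliterated input is not stemmed.  Stem outputs are
# not asserted here.
#
#     from nltk.stem.snowball import RussianStemmer
#
#     stemmer = RussianStemmer()
#     print(stemmer.stem("авиация"))    # Cyrillic input is stemmed
#     print(stemmer.stem("aviatsiia"))  # Latin-only input is returned as-is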
  4557. def __regions_russian(self, word):
  4558. """
  4559. Return the regions RV and R2 which are used by the Russian stemmer.
In any word, RV is the region after the first vowel,
or the end of the word if it contains no vowel.
R1 is the region after the first non-vowel following a vowel,
or the end of the word if there is no such non-vowel.
R2 is the region after the first non-vowel following
a vowel in R1, or the end of the word if there is no such non-vowel.
  4566. :param word: The Russian word whose regions RV and R2 are determined.
  4567. :type word: str or unicode
  4568. :return: the regions RV and R2 for the respective Russian word.
  4569. :rtype: tuple
  4570. :note: This helper method is invoked by the stem method of the subclass
  4571. RussianStemmer. It is not to be invoked directly!
  4572. """
  4573. r1 = ""
  4574. r2 = ""
  4575. rv = ""
  4576. vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
  4577. word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E")
  4578. for i in range(1, len(word)):
  4579. if word[i] not in vowels and word[i - 1] in vowels:
  4580. r1 = word[i + 1 :]
  4581. break
  4582. for i in range(1, len(r1)):
  4583. if r1[i] not in vowels and r1[i - 1] in vowels:
  4584. r2 = r1[i + 1 :]
  4585. break
  4586. for i in range(len(word)):
  4587. if word[i] in vowels:
  4588. rv = word[i + 1 :]
  4589. break
  4590. r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
  4591. rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`")
  4592. return (rv, r2)
  4593. def __cyrillic_to_roman(self, word):
  4594. """
  4595. Transliterate a Russian word into the Roman alphabet.
A Russian word written in the Cyrillic alphabet is
transliterated into the Roman alphabet
in order to ease the forthcoming stemming process.
  4599. :param word: The word that is transliterated.
  4600. :type word: unicode
  4601. :return: the transliterated word.
  4602. :rtype: unicode
  4603. :note: This helper method is invoked by the stem method of the subclass
  4604. RussianStemmer. It is not to be invoked directly!
  4605. """
  4606. word = (
  4607. word.replace("\u0410", "a")
  4608. .replace("\u0430", "a")
  4609. .replace("\u0411", "b")
  4610. .replace("\u0431", "b")
  4611. .replace("\u0412", "v")
  4612. .replace("\u0432", "v")
  4613. .replace("\u0413", "g")
  4614. .replace("\u0433", "g")
  4615. .replace("\u0414", "d")
  4616. .replace("\u0434", "d")
  4617. .replace("\u0415", "e")
  4618. .replace("\u0435", "e")
  4619. .replace("\u0401", "e")
  4620. .replace("\u0451", "e")
  4621. .replace("\u0416", "zh")
  4622. .replace("\u0436", "zh")
  4623. .replace("\u0417", "z")
  4624. .replace("\u0437", "z")
  4625. .replace("\u0418", "i")
  4626. .replace("\u0438", "i")
  4627. .replace("\u0419", "i`")
  4628. .replace("\u0439", "i`")
  4629. .replace("\u041A", "k")
  4630. .replace("\u043A", "k")
  4631. .replace("\u041B", "l")
  4632. .replace("\u043B", "l")
  4633. .replace("\u041C", "m")
  4634. .replace("\u043C", "m")
  4635. .replace("\u041D", "n")
  4636. .replace("\u043D", "n")
  4637. .replace("\u041E", "o")
  4638. .replace("\u043E", "o")
  4639. .replace("\u041F", "p")
  4640. .replace("\u043F", "p")
  4641. .replace("\u0420", "r")
  4642. .replace("\u0440", "r")
  4643. .replace("\u0421", "s")
  4644. .replace("\u0441", "s")
  4645. .replace("\u0422", "t")
  4646. .replace("\u0442", "t")
  4647. .replace("\u0423", "u")
  4648. .replace("\u0443", "u")
  4649. .replace("\u0424", "f")
  4650. .replace("\u0444", "f")
  4651. .replace("\u0425", "kh")
  4652. .replace("\u0445", "kh")
  4653. .replace("\u0426", "t^s")
  4654. .replace("\u0446", "t^s")
  4655. .replace("\u0427", "ch")
  4656. .replace("\u0447", "ch")
  4657. .replace("\u0428", "sh")
  4658. .replace("\u0448", "sh")
  4659. .replace("\u0429", "shch")
  4660. .replace("\u0449", "shch")
  4661. .replace("\u042A", "''")
  4662. .replace("\u044A", "''")
  4663. .replace("\u042B", "y")
  4664. .replace("\u044B", "y")
  4665. .replace("\u042C", "'")
  4666. .replace("\u044C", "'")
  4667. .replace("\u042D", "e`")
  4668. .replace("\u044D", "e`")
  4669. .replace("\u042E", "i^u")
  4670. .replace("\u044E", "i^u")
  4671. .replace("\u042F", "i^a")
  4672. .replace("\u044F", "i^a")
  4673. )
  4674. return word
  4675. def __roman_to_cyrillic(self, word):
  4676. """
  4677. Transliterate a Russian word back into the Cyrillic alphabet.
A Russian word that was previously transliterated into the Roman
alphabet in order to ease the stemming process is transliterated
back into the Cyrillic alphabet, its original form.
  4681. :param word: The word that is transliterated.
  4682. :type word: str or unicode
  4683. :return: word, the transliterated word.
  4684. :rtype: unicode
  4685. :note: This helper method is invoked by the stem method of the subclass
  4686. RussianStemmer. It is not to be invoked directly!
  4687. """
  4688. word = (
  4689. word.replace("i^u", "\u044E")
  4690. .replace("i^a", "\u044F")
  4691. .replace("shch", "\u0449")
  4692. .replace("kh", "\u0445")
  4693. .replace("t^s", "\u0446")
  4694. .replace("ch", "\u0447")
  4695. .replace("e`", "\u044D")
  4696. .replace("i`", "\u0439")
  4697. .replace("sh", "\u0448")
  4698. .replace("k", "\u043A")
  4699. .replace("e", "\u0435")
  4700. .replace("zh", "\u0436")
  4701. .replace("a", "\u0430")
  4702. .replace("b", "\u0431")
  4703. .replace("v", "\u0432")
  4704. .replace("g", "\u0433")
  4705. .replace("d", "\u0434")
  4706. .replace("e", "\u0435")
  4707. .replace("z", "\u0437")
  4708. .replace("i", "\u0438")
  4709. .replace("l", "\u043B")
  4710. .replace("m", "\u043C")
  4711. .replace("n", "\u043D")
  4712. .replace("o", "\u043E")
  4713. .replace("p", "\u043F")
  4714. .replace("r", "\u0440")
  4715. .replace("s", "\u0441")
  4716. .replace("t", "\u0442")
  4717. .replace("u", "\u0443")
  4718. .replace("f", "\u0444")
  4719. .replace("''", "\u044A")
  4720. .replace("y", "\u044B")
  4721. .replace("'", "\u044C")
  4722. )
  4723. return word
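# Illustrative round-trip check for the two helpers above: each Cyrillic letter
# maps to a fixed Latin sequence, and the inverse mapping restores the original
# spelling for ordinary words.  The helpers are private and documented as not
# to be invoked directly, so the name-mangled access below is a throwaway
# sanity check only, not recommended usage.
#
#     from nltk.stem.snowball import RussianStemmer
#
#     stemmer = RussianStemmer()
#     roman = stemmer._RussianStemmer__cyrillic_to_roman("москва")   # 'moskva'
#     assert stemmer._RussianStemmer__roman_to_cyrillic(roman) == "москва"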
  4724. class SpanishStemmer(_StandardStemmer):
  4725. """
  4726. The Spanish Snowball stemmer.
  4727. :cvar __vowels: The Spanish vowels.
  4728. :type __vowels: unicode
  4729. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  4730. :type __step0_suffixes: tuple
  4731. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  4732. :type __step1_suffixes: tuple
  4733. :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
  4734. :type __step2a_suffixes: tuple
  4735. :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
  4736. :type __step2b_suffixes: tuple
  4737. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  4738. :type __step3_suffixes: tuple
  4739. :note: A detailed description of the Spanish
  4740. stemming algorithm can be found under
  4741. http://snowball.tartarus.org/algorithms/spanish/stemmer.html
  4742. """
  4743. __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC"
  4744. __step0_suffixes = (
  4745. "selas",
  4746. "selos",
  4747. "sela",
  4748. "selo",
  4749. "las",
  4750. "les",
  4751. "los",
  4752. "nos",
  4753. "me",
  4754. "se",
  4755. "la",
  4756. "le",
  4757. "lo",
  4758. )
  4759. __step1_suffixes = (
  4760. 'amientos',
  4761. 'imientos',
  4762. 'amiento',
  4763. 'imiento',
  4764. 'aciones',
  4765. 'uciones',
  4766. 'adoras',
  4767. 'adores',
  4768. 'ancias',
  4769. 'log\xEDas',
  4770. 'encias',
  4771. 'amente',
  4772. 'idades',
  4773. 'anzas',
  4774. 'ismos',
  4775. 'ables',
  4776. 'ibles',
  4777. 'istas',
  4778. 'adora',
  4779. 'aci\xF3n',
  4780. 'antes',
  4781. 'ancia',
  4782. 'log\xEDa',
  4783. 'uci\xf3n',
  4784. 'encia',
  4785. 'mente',
  4786. 'anza',
  4787. 'icos',
  4788. 'icas',
  4789. 'ismo',
  4790. 'able',
  4791. 'ible',
  4792. 'ista',
  4793. 'osos',
  4794. 'osas',
  4795. 'ador',
  4796. 'ante',
  4797. 'idad',
  4798. 'ivas',
  4799. 'ivos',
  4800. 'ico',
  4801. 'ica',
  4802. 'oso',
  4803. 'osa',
  4804. 'iva',
  4805. 'ivo',
  4806. )
  4807. __step2a_suffixes = (
  4808. 'yeron',
  4809. 'yendo',
  4810. 'yamos',
  4811. 'yais',
  4812. 'yan',
  4813. 'yen',
  4814. 'yas',
  4815. 'yes',
  4816. 'ya',
  4817. 'ye',
  4818. 'yo',
  4819. 'y\xF3',
  4820. )
  4821. __step2b_suffixes = (
  4822. 'ar\xEDamos',
  4823. 'er\xEDamos',
  4824. 'ir\xEDamos',
  4825. 'i\xE9ramos',
  4826. 'i\xE9semos',
  4827. 'ar\xEDais',
  4828. 'aremos',
  4829. 'er\xEDais',
  4830. 'eremos',
  4831. 'ir\xEDais',
  4832. 'iremos',
  4833. 'ierais',
  4834. 'ieseis',
  4835. 'asteis',
  4836. 'isteis',
  4837. '\xE1bamos',
  4838. '\xE1ramos',
  4839. '\xE1semos',
  4840. 'ar\xEDan',
  4841. 'ar\xEDas',
  4842. 'ar\xE9is',
  4843. 'er\xEDan',
  4844. 'er\xEDas',
  4845. 'er\xE9is',
  4846. 'ir\xEDan',
  4847. 'ir\xEDas',
  4848. 'ir\xE9is',
  4849. 'ieran',
  4850. 'iesen',
  4851. 'ieron',
  4852. 'iendo',
  4853. 'ieras',
  4854. 'ieses',
  4855. 'abais',
  4856. 'arais',
  4857. 'aseis',
  4858. '\xE9amos',
  4859. 'ar\xE1n',
  4860. 'ar\xE1s',
  4861. 'ar\xEDa',
  4862. 'er\xE1n',
  4863. 'er\xE1s',
  4864. 'er\xEDa',
  4865. 'ir\xE1n',
  4866. 'ir\xE1s',
  4867. 'ir\xEDa',
  4868. 'iera',
  4869. 'iese',
  4870. 'aste',
  4871. 'iste',
  4872. 'aban',
  4873. 'aran',
  4874. 'asen',
  4875. 'aron',
  4876. 'ando',
  4877. 'abas',
  4878. 'adas',
  4879. 'idas',
  4880. 'aras',
  4881. 'ases',
  4882. '\xEDais',
  4883. 'ados',
  4884. 'idos',
  4885. 'amos',
  4886. 'imos',
  4887. 'emos',
  4888. 'ar\xE1',
  4889. 'ar\xE9',
  4890. 'er\xE1',
  4891. 'er\xE9',
  4892. 'ir\xE1',
  4893. 'ir\xE9',
  4894. 'aba',
  4895. 'ada',
  4896. 'ida',
  4897. 'ara',
  4898. 'ase',
  4899. '\xEDan',
  4900. 'ado',
  4901. 'ido',
  4902. '\xEDas',
  4903. '\xE1is',
  4904. '\xE9is',
  4905. '\xEDa',
  4906. 'ad',
  4907. 'ed',
  4908. 'id',
  4909. 'an',
  4910. 'i\xF3',
  4911. 'ar',
  4912. 'er',
  4913. 'ir',
  4914. 'as',
  4915. '\xEDs',
  4916. 'en',
  4917. 'es',
  4918. )
  4919. __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3")
  4920. def stem(self, word):
  4921. """
  4922. Stem a Spanish word and return the stemmed form.
  4923. :param word: The word that is stemmed.
  4924. :type word: str or unicode
  4925. :return: The stemmed form.
  4926. :rtype: unicode
  4927. """
  4928. word = word.lower()
  4929. if word in self.stopwords:
  4930. return word
  4931. step1_success = False
  4932. r1, r2 = self._r1r2_standard(word, self.__vowels)
  4933. rv = self._rv_standard(word, self.__vowels)
  4934. # STEP 0: Attached pronoun
  4935. for suffix in self.__step0_suffixes:
  4936. if not (word.endswith(suffix) and rv.endswith(suffix)):
  4937. continue
  4938. if (
  4939. rv[: -len(suffix)].endswith(
  4940. (
  4941. "ando",
  4942. "\xE1ndo",
  4943. "ar",
  4944. "\xE1r",
  4945. "er",
  4946. "\xE9r",
  4947. "iendo",
  4948. "i\xE9ndo",
  4949. "ir",
  4950. "\xEDr",
  4951. )
  4952. )
  4953. ) or (
  4954. rv[: -len(suffix)].endswith("yendo")
  4955. and word[: -len(suffix)].endswith("uyendo")
  4956. ):
  4957. word = self.__replace_accented(word[: -len(suffix)])
  4958. r1 = self.__replace_accented(r1[: -len(suffix)])
  4959. r2 = self.__replace_accented(r2[: -len(suffix)])
  4960. rv = self.__replace_accented(rv[: -len(suffix)])
  4961. break
  4962. # STEP 1: Standard suffix removal
  4963. for suffix in self.__step1_suffixes:
  4964. if not word.endswith(suffix):
  4965. continue
  4966. if suffix == "amente" and r1.endswith(suffix):
  4967. step1_success = True
  4968. word = word[:-6]
  4969. r2 = r2[:-6]
  4970. rv = rv[:-6]
  4971. if r2.endswith("iv"):
  4972. word = word[:-2]
  4973. r2 = r2[:-2]
  4974. rv = rv[:-2]
  4975. if r2.endswith("at"):
  4976. word = word[:-2]
  4977. rv = rv[:-2]
  4978. elif r2.endswith(("os", "ic", "ad")):
  4979. word = word[:-2]
  4980. rv = rv[:-2]
  4981. elif r2.endswith(suffix):
  4982. step1_success = True
  4983. if suffix in (
  4984. "adora",
  4985. "ador",
  4986. "aci\xF3n",
  4987. "adoras",
  4988. "adores",
  4989. "aciones",
  4990. "ante",
  4991. "antes",
  4992. "ancia",
  4993. "ancias",
  4994. ):
  4995. word = word[: -len(suffix)]
  4996. r2 = r2[: -len(suffix)]
  4997. rv = rv[: -len(suffix)]
  4998. if r2.endswith("ic"):
  4999. word = word[:-2]
  5000. rv = rv[:-2]
  5001. elif suffix in ("log\xEDa", "log\xEDas"):
  5002. word = suffix_replace(word, suffix, "log")
  5003. rv = suffix_replace(rv, suffix, "log")
  5004. elif suffix in ("uci\xF3n", "uciones"):
  5005. word = suffix_replace(word, suffix, "u")
  5006. rv = suffix_replace(rv, suffix, "u")
  5007. elif suffix in ("encia", "encias"):
  5008. word = suffix_replace(word, suffix, "ente")
  5009. rv = suffix_replace(rv, suffix, "ente")
  5010. elif suffix == "mente":
  5011. word = word[: -len(suffix)]
  5012. r2 = r2[: -len(suffix)]
  5013. rv = rv[: -len(suffix)]
  5014. if r2.endswith(("ante", "able", "ible")):
  5015. word = word[:-4]
  5016. rv = rv[:-4]
  5017. elif suffix in ("idad", "idades"):
  5018. word = word[: -len(suffix)]
  5019. r2 = r2[: -len(suffix)]
  5020. rv = rv[: -len(suffix)]
  5021. for pre_suff in ("abil", "ic", "iv"):
  5022. if r2.endswith(pre_suff):
  5023. word = word[: -len(pre_suff)]
  5024. rv = rv[: -len(pre_suff)]
  5025. elif suffix in ("ivo", "iva", "ivos", "ivas"):
  5026. word = word[: -len(suffix)]
  5027. r2 = r2[: -len(suffix)]
  5028. rv = rv[: -len(suffix)]
  5029. if r2.endswith("at"):
  5030. word = word[:-2]
  5031. rv = rv[:-2]
  5032. else:
  5033. word = word[: -len(suffix)]
  5034. rv = rv[: -len(suffix)]
  5035. break
  5036. # STEP 2a: Verb suffixes beginning 'y'
  5037. if not step1_success:
  5038. for suffix in self.__step2a_suffixes:
  5039. if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u":
  5040. word = word[: -len(suffix)]
  5041. rv = rv[: -len(suffix)]
  5042. break
  5043. # STEP 2b: Other verb suffixes
  5044. for suffix in self.__step2b_suffixes:
  5045. if rv.endswith(suffix):
  5046. word = word[: -len(suffix)]
  5047. rv = rv[: -len(suffix)]
  5048. if suffix in ("en", "es", "\xE9is", "emos"):
  5049. if word.endswith("gu"):
  5050. word = word[:-1]
  5051. if rv.endswith("gu"):
  5052. rv = rv[:-1]
  5053. break
  5054. # STEP 3: Residual suffix
  5055. for suffix in self.__step3_suffixes:
  5056. if rv.endswith(suffix):
  5057. word = word[: -len(suffix)]
  5058. if suffix in ("e", "\xE9"):
  5059. rv = rv[: -len(suffix)]
  5060. if word[-2:] == "gu" and rv.endswith("u"):
  5061. word = word[:-1]
  5062. break
  5063. word = self.__replace_accented(word)
  5064. return word
  5065. def __replace_accented(self, word):
  5066. """
Replaces all accented letters in a word with their non-accented
counterparts.
:param word: A Spanish word, with or without accents
  5070. :type word: str or unicode
  5071. :return: a word with the accented letters (á, é, í, ó, ú) replaced with
  5072. their non-accented counterparts (a, e, i, o, u)
  5073. :rtype: str or unicode
  5074. """
  5075. return (
  5076. word.replace("\xE1", "a")
  5077. .replace("\xE9", "e")
  5078. .replace("\xED", "i")
  5079. .replace("\xF3", "o")
  5080. .replace("\xFA", "u")
  5081. )
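# Illustrative check for the helper above (a toy example, not a test suite):
# each accented Spanish vowel is mapped to its unaccented counterpart.  The
# helper is private, so the name-mangled access below is for illustration only.
#
#     from nltk.stem.snowball import SpanishStemmer
#
#     stemmer = SpanishStemmer()
#     assert stemmer._SpanishStemmer__replace_accented("canción") == "cancion"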
  5082. class SwedishStemmer(_ScandinavianStemmer):
  5083. """
  5084. The Swedish Snowball stemmer.
  5085. :cvar __vowels: The Swedish vowels.
  5086. :type __vowels: unicode
:cvar __s_ending: Letters that may directly appear before a word-final 's'.
  5088. :type __s_ending: unicode
  5089. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  5090. :type __step1_suffixes: tuple
  5091. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  5092. :type __step2_suffixes: tuple
  5093. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  5094. :type __step3_suffixes: tuple
  5095. :note: A detailed description of the Swedish
  5096. stemming algorithm can be found under
  5097. http://snowball.tartarus.org/algorithms/swedish/stemmer.html
  5098. """
  5099. __vowels = "aeiouy\xE4\xE5\xF6"
  5100. __s_ending = "bcdfghjklmnoprtvy"
  5101. __step1_suffixes = (
  5102. "heterna",
  5103. "hetens",
  5104. "heter",
  5105. "heten",
  5106. "anden",
  5107. "arnas",
  5108. "ernas",
  5109. "ornas",
  5110. "andes",
  5111. "andet",
  5112. "arens",
  5113. "arna",
  5114. "erna",
  5115. "orna",
  5116. "ande",
  5117. "arne",
  5118. "aste",
  5119. "aren",
  5120. "ades",
  5121. "erns",
  5122. "ade",
  5123. "are",
  5124. "ern",
  5125. "ens",
  5126. "het",
  5127. "ast",
  5128. "ad",
  5129. "en",
  5130. "ar",
  5131. "er",
  5132. "or",
  5133. "as",
  5134. "es",
  5135. "at",
  5136. "a",
  5137. "e",
  5138. "s",
  5139. )
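    # The suffix tuples are ordered from longest to shortest so that the
    # single-pass loops in stem(), which break on the first match, always
    # strip the longest matching ending.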
    __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
    __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig")
    def stem(self, word):
        """
        Stem a Swedish word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        if word in self.stopwords:
            return word

        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "s":
                    if word[-2] in self.__s_ending:
                        word = word[:-1]
                        r1 = r1[:-1]
                else:
                    word = word[: -len(suffix)]
                    r1 = r1[: -len(suffix)]
                break

        # STEP 2
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                word = word[:-1]
                r1 = r1[:-1]
                break

        # STEP 3
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix in ("els", "lig", "ig"):
                    word = word[: -len(suffix)]
                elif suffix in ("fullt", "l\xF6st"):
                    word = word[:-1]
                break

        return word
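
# Illustrative usage of the Swedish stemmer (informal expectations based on
# the suffix tables above; these are comments, not doctests):
#
#     >>> from nltk.stem.snowball import SwedishStemmer
#     >>> SwedishStemmer().stem("jackorna")   # step 1: '-orna' removed in R1
#     'jack'
#     >>> SwedishStemmer().stem("friskt")     # step 2: trailing 't' of 'kt' dropped
#     'frisk'
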
def demo():
    """
    This function provides a demonstration of the Snowball stemmers.

    After invoking this function and specifying a language,
    it stems an excerpt of the Universal Declaration of Human Rights
    (which is a part of the NLTK corpus collection) and then prints
    out the original and the stemmed text.
    """
    from nltk.corpus import udhr

    udhr_corpus = {
        "arabic": "Arabic_Alarabia-Arabic",
        "danish": "Danish_Dansk-Latin1",
        "dutch": "Dutch_Nederlands-Latin1",
        "english": "English-Latin1",
        "finnish": "Finnish_Suomi-Latin1",
        "french": "French_Francais-Latin1",
        "german": "German_Deutsch-Latin1",
        "hungarian": "Hungarian_Magyar-UTF8",
        "italian": "Italian_Italiano-Latin1",
        "norwegian": "Norwegian-Latin1",
        "porter": "English-Latin1",
        "portuguese": "Portuguese_Portugues-Latin1",
        "romanian": "Romanian_Romana-Latin2",
        "russian": "Russian-UTF8",
        "spanish": "Spanish-Latin1",
        "swedish": "Swedish_Svenska-Latin1",
    }

    print("\n")
    print("******************************")
    print("Demo for the Snowball stemmers")
    print("******************************")

    while True:
        language = input(
            "Please enter the name of the language "
            + "to be demonstrated\n"
            + "/".join(SnowballStemmer.languages)
            + "\n"
            + "(enter 'exit' in order to leave): "
        )

        if language == "exit":
            break

        if language not in SnowballStemmer.languages:
            print(
                "\nOops, there is no stemmer for this language. "
                "Please try again.\n"
            )
            continue

        stemmer = SnowballStemmer(language)
        excerpt = udhr.words(udhr_corpus[language])[:300]

        stemmed = " ".join(stemmer.stem(word) for word in excerpt)
        stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip()
        excerpt = " ".join(excerpt)
        excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip()

        print("\n")
        print("-" * 70)
        print("ORIGINAL".center(70))
        print(excerpt)
        print("\n\n")
        print("STEMMED RESULTS".center(70))
        print(stemmed)
        print("-" * 70)
        print("\n")
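
# To try the stemmers interactively, call demo() from an interpreter:
#
#     >>> from nltk.stem.snowball import demo
#     >>> demo()
#
# The demo reads from the NLTK udhr corpus, which may need to be downloaded
# first (e.g. via nltk.download("udhr")).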