12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: ALINE
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Greg Kondrak <gkondrak@ualberta.ca>
- # Geoff Bacon <bacon@berkeley.edu> (Python port)
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- ALINE
- http://webdocs.cs.ualberta.ca/~kondrak/
- Copyright 2002 by Grzegorz Kondrak.
- ALINE is an algorithm for aligning phonetic sequences, described in [1].
- This module is a port of Kondrak's (2002) ALINE. It provides functions for
- phonetic sequence alignment and similarity analysis. These are useful in
- historical linguistics, sociolinguistics and synchronic phonology.
- ALINE has parameters that can be tuned for desired output. These parameters are:
- - C_skip, C_sub, C_exp, C_vwl
- - Salience weights
- - Segmental features
- In this implementation, some parameters have been changed from their default
- values as described in [1], in order to replicate published results. All changes
- are noted in comments.
- Example usage
- -------------
- # Get optimal alignment of two phonetic sequences
- >>> align('θin', 'tenwis') # doctest: +SKIP
- [[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]]
- [1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation,
- University of Toronto.
- """
- from __future__ import unicode_literals
- try:
- import numpy as np
- except ImportError:
- np = None
# === Constants ===

inf = float('inf')

# Default values for maximum similarity scores (Kondrak 2002: 54)
# NOTE: C_skip is a penalty and therefore negative.  With a positive value
# every skipped segment *raises* the local-alignment score, so alignments
# get padded with spurious ('-', x) pairs instead of stopping at the
# best-matching region.
C_skip = -10  # Indels
C_sub = 35  # Substitutions
C_exp = 45  # Expansions/compressions
C_vwl = 5  # Vowel/consonant relative weight (decreased from 10)
# Segments treated as consonants by R() and V().  Every entry must be a
# single character, because membership is tested against individual
# segments of the input strings.  (An earlier revision carried a stray
# 'ʐ ' entry with a trailing space, which could never match; the segment
# is now listed once, in its sorted position.)
consonants = [
    'B', 'N', 'R',
    'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
    'n', 'p', 'q', 'r', 's', 't', 'v', 'x', 'z',
    'ç', 'ð', 'ħ', 'ŋ',
    'ɖ', 'ɟ', 'ɢ', 'ɣ', 'ɦ', 'ɬ', 'ɮ', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ',
    'ɸ', 'ɹ', 'ɻ', 'ɽ', 'ɾ',
    'ʀ', 'ʁ', 'ʂ', 'ʃ', 'ʈ', 'ʋ', 'ʐ', 'ʒ', 'ʔ', 'ʕ', 'ʙ', 'ʝ',
    'β', 'θ', 'χ',
    'w',
]
# Relevant features for comparing consonants and vowels.
# R_c is used when at least one segment is a consonant, R_v otherwise.
R_c = 'aspirated lateral manner nasal place retroflex syllabic voice'.split()

# 'high' taken out of R_v because same as manner
R_v = (
    'back lateral long manner nasal place retroflex round syllabic voice'
).split()
# Flattened feature matrix (Kondrak 2002: 56)
# Maps every feature *value* name to its numeric position on that
# feature's scale; diff() compares segments through these numbers.
similarity_matrix = {}

# place
similarity_matrix.update(
    [
        ('bilabial', 1.0),
        ('labiodental', 0.95),
        ('dental', 0.9),
        ('alveolar', 0.85),
        ('retroflex', 0.8),
        ('palato-alveolar', 0.75),
        ('palatal', 0.7),
        ('velar', 0.6),
        ('uvular', 0.5),
        ('pharyngeal', 0.3),
        ('glottal', 0.1),
        ('labiovelar', 1.0),
        ('vowel', -1.0),  # added 'vowel'
    ]
)

# manner
similarity_matrix.update(
    [
        ('stop', 1.0),
        ('affricate', 0.9),
        ('fricative', 0.85),  # increased fricative from 0.8
        ('trill', 0.7),
        ('tap', 0.65),
        ('approximant', 0.6),
        ('high vowel', 0.4),
        ('mid vowel', 0.2),
        ('low vowel', 0.0),
        ('vowel2', 0.5),  # added vowel
    ]
)

# high
similarity_matrix.update([('high', 1.0), ('mid', 0.5), ('low', 0.0)])

# back
similarity_matrix.update([('front', 1.0), ('central', 0.5), ('back', 0.0)])

# binary features
similarity_matrix.update([('plus', 1.0), ('minus', 0.0)])
# Relative weights of phonetic features (Kondrak 2002: 55)
# delta() multiplies each per-feature difference by the weight given here.
salience = dict(
    syllabic=5,
    place=40,
    manner=50,
    voice=5,  # decreased from 10
    nasal=20,  # increased from 10
    retroflex=10,
    lateral=10,
    aspirated=5,
    long=0,  # decreased from 1
    high=3,  # decreased from 5
    back=2,  # decreased from 5
    round=2,  # decreased from 5
)
# (Kondrak 2002: 59-60)
# The per-segment feature bundles are generated from the compact row tables
# below; the resulting ``feature_matrix`` maps each segment to a dict of
# feature name -> feature value name (values are keys of similarity_matrix).
_P = 'plus'
_M = 'minus'


def _consonant_features(place, manner, voice, nasal, retroflex, lateral):
    """Return the feature dict of a consonant.

    Every consonant in the table is non-syllabic and unaspirated, so those
    two values are fixed here instead of being repeated in each row.
    """
    return {
        'place': place,
        'manner': manner,
        'syllabic': _M,
        'voice': voice,
        'nasal': nasal,
        'retroflex': retroflex,
        'lateral': lateral,
        'aspirated': _M,
    }


def _vowel_features(high, back, round_, long_):
    """Return the feature dict of a vowel.

    Every vowel in the table shares place/manner/syllabic/voice/nasal/
    retroflex/lateral/aspirated; only height, backness, rounding and length
    vary from row to row.
    """
    return {
        'place': 'vowel',
        'manner': 'vowel2',
        'syllabic': _P,
        'voice': _P,
        'nasal': _M,
        'retroflex': _M,
        'lateral': _M,
        'high': high,
        'back': back,
        'round': round_,
        'long': long_,
        'aspirated': _M,
    }


_CONSONANT_ROWS = (
    # segment, place, manner, voice, nasal, retroflex, lateral
    ('p', 'bilabial', 'stop', _M, _M, _M, _M),
    ('b', 'bilabial', 'stop', _P, _M, _M, _M),
    ('t', 'alveolar', 'stop', _M, _M, _M, _M),
    ('d', 'alveolar', 'stop', _P, _M, _M, _M),
    ('ʈ', 'retroflex', 'stop', _M, _M, _P, _M),
    ('ɖ', 'retroflex', 'stop', _P, _M, _P, _M),
    ('c', 'palatal', 'stop', _M, _M, _M, _M),
    ('ɟ', 'palatal', 'stop', _P, _M, _M, _M),
    ('k', 'velar', 'stop', _M, _M, _M, _M),
    ('g', 'velar', 'stop', _P, _M, _M, _M),
    ('q', 'uvular', 'stop', _M, _M, _M, _M),
    ('ɢ', 'uvular', 'stop', _P, _M, _M, _M),
    ('ʔ', 'glottal', 'stop', _M, _M, _M, _M),
    ('m', 'bilabial', 'stop', _P, _P, _M, _M),
    ('ɱ', 'labiodental', 'stop', _P, _P, _M, _M),
    ('n', 'alveolar', 'stop', _P, _P, _M, _M),
    ('ɳ', 'retroflex', 'stop', _P, _P, _P, _M),
    ('ɲ', 'palatal', 'stop', _P, _P, _M, _M),
    ('ŋ', 'velar', 'stop', _P, _P, _M, _M),
    ('ɴ', 'uvular', 'stop', _P, _P, _M, _M),
    ('N', 'uvular', 'stop', _P, _P, _M, _M),
    ('ʙ', 'bilabial', 'trill', _P, _M, _M, _M),
    ('B', 'bilabial', 'trill', _P, _M, _M, _M),
    ('r', 'alveolar', 'trill', _P, _M, _P, _M),
    ('ʀ', 'uvular', 'trill', _P, _M, _M, _M),
    ('R', 'uvular', 'trill', _P, _M, _M, _M),
    ('ɾ', 'alveolar', 'tap', _P, _M, _M, _M),
    ('ɽ', 'retroflex', 'tap', _P, _M, _P, _M),
    ('ɸ', 'bilabial', 'fricative', _M, _M, _M, _M),
    ('β', 'bilabial', 'fricative', _P, _M, _M, _M),
    ('f', 'labiodental', 'fricative', _M, _M, _M, _M),
    ('v', 'labiodental', 'fricative', _P, _M, _M, _M),
    ('θ', 'dental', 'fricative', _M, _M, _M, _M),
    ('ð', 'dental', 'fricative', _P, _M, _M, _M),
    ('s', 'alveolar', 'fricative', _M, _M, _M, _M),
    ('z', 'alveolar', 'fricative', _P, _M, _M, _M),
    ('ʃ', 'palato-alveolar', 'fricative', _M, _M, _M, _M),
    ('ʒ', 'palato-alveolar', 'fricative', _P, _M, _M, _M),
    ('ʂ', 'retroflex', 'fricative', _M, _M, _P, _M),
    ('ʐ', 'retroflex', 'fricative', _P, _M, _P, _M),
    ('ç', 'palatal', 'fricative', _M, _M, _M, _M),
    ('ʝ', 'palatal', 'fricative', _P, _M, _M, _M),
    ('x', 'velar', 'fricative', _M, _M, _M, _M),
    ('ɣ', 'velar', 'fricative', _P, _M, _M, _M),
    ('χ', 'uvular', 'fricative', _M, _M, _M, _M),
    ('ʁ', 'uvular', 'fricative', _P, _M, _M, _M),
    ('ħ', 'pharyngeal', 'fricative', _M, _M, _M, _M),
    ('ʕ', 'pharyngeal', 'fricative', _P, _M, _M, _M),
    ('h', 'glottal', 'fricative', _M, _M, _M, _M),
    ('ɦ', 'glottal', 'fricative', _P, _M, _M, _M),
    ('ɬ', 'alveolar', 'fricative', _M, _M, _M, _P),
    ('ɮ', 'alveolar', 'fricative', _P, _M, _M, _P),
    ('ʋ', 'labiodental', 'approximant', _P, _M, _M, _M),
    ('ɹ', 'alveolar', 'approximant', _P, _M, _M, _M),
    ('ɻ', 'retroflex', 'approximant', _P, _M, _P, _M),
    ('j', 'palatal', 'approximant', _P, _M, _M, _M),
    ('ɰ', 'velar', 'approximant', _P, _M, _M, _M),
    ('l', 'alveolar', 'approximant', _P, _M, _M, _P),
    ('w', 'labiovelar', 'approximant', _P, _M, _M, _M),
)

_VOWEL_ROWS = (
    # segment, high, back, round, long
    ('i', 'high', 'front', _M, _M),
    ('y', 'high', 'front', _P, _M),
    ('e', 'mid', 'front', _M, _M),
    ('E', 'mid', 'front', _M, _P),
    ('ø', 'mid', 'front', _P, _M),
    ('ɛ', 'mid', 'front', _M, _M),
    ('œ', 'mid', 'front', _P, _M),
    ('æ', 'low', 'front', _M, _M),
    ('a', 'low', 'front', _M, _M),
    ('A', 'low', 'front', _M, _P),
    ('ɨ', 'high', 'central', _M, _M),
    ('ʉ', 'high', 'central', _P, _M),
    ('ə', 'mid', 'central', _M, _M),
    ('u', 'high', 'back', _P, _M),
    ('U', 'high', 'back', _P, _P),
    ('o', 'mid', 'back', _P, _M),
    ('O', 'mid', 'back', _P, _P),
    ('ɔ', 'mid', 'back', _P, _M),
    ('ɒ', 'low', 'back', _M, _M),
    ('I', 'high', 'front', _M, _P),
)

# Consonants first, then vowels, in table order.
feature_matrix = {
    row[0]: _consonant_features(*row[1:]) for row in _CONSONANT_ROWS
}
feature_matrix.update(
    (row[0], _vowel_features(*row[1:])) for row in _VOWEL_ROWS
)
- # === Algorithm ===
def align(str1, str2, epsilon=0):
    """
    Compute the local alignment(s) of two phonetic strings.

    A similarity matrix ``S`` is filled by dynamic programming, where each
    cell takes the best of: skipping a segment of either string, substituting
    one segment for another, expanding/compressing one segment against two,
    or 0.  Every cell whose score reaches the threshold
    ``(1 - epsilon) * max(S)`` is then traced back into an alignment.

    :type str1, str2: str
    :param str1, str2: Two strings to be aligned
    :type epsilon: float (0.0 to 1.0)
    :param epsilon: Adjusts threshold similarity score for near-optimal alignments
    :rtype: list(list(tuple(str, str)))
    :return: Alignment(s) of str1 and str2
    :raises ImportError: if numpy could not be imported

    (Kondrak 2002: 51)
    """
    if np is None:
        raise ImportError('You need numpy in order to use the align function')

    assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0."

    rows, cols = len(str1), len(str2)

    # Row 0 and column 0 stay at 0, which is Kondrak's initialization.
    S = np.zeros((rows + 1, cols + 1), dtype=float)

    for i, p in enumerate(str1, 1):
        for j, q in enumerate(str2, 1):
            candidates = [
                S[i - 1, j] + sigma_skip(p),
                S[i, j - 1] + sigma_skip(q),
                S[i - 1, j - 1] + sigma_sub(p, q),
            ]
            # An expansion consumes two segments of one string, so it is only
            # defined from the second row/column on; otherwise score it -inf
            # so that it can never be selected.
            candidates.append(
                S[i - 2, j - 1] + sigma_exp(q, str1[i - 2 : i]) if i > 1 else -inf
            )
            candidates.append(
                S[i - 1, j - 2] + sigma_exp(p, str2[j - 2 : j]) if j > 1 else -inf
            )
            candidates.append(0)
            S[i, j] = max(candidates)

    # Threshold score for near-optimal alignments
    T = (1 - epsilon) * np.amax(S)

    return [
        _retrieve(i, j, 0, S, T, str1, str2, [])
        for i in range(1, rows + 1)
        for j in range(1, cols + 1)
        if S[i, j] >= T
    ]
- def _retrieve(i, j, s, S, T, str1, str2, out):
- """
- Retrieve the path through the similarity matrix S starting at (i, j).
- :rtype: list(tuple(str, str))
- :return: Alignment of str1 and str2
- """
- if S[i, j] == 0:
- return out
- else:
- if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T:
- out.insert(0, (str1[i - 1], str2[j - 2 : j]))
- _retrieve(
- i - 1,
- j - 2,
- s + sigma_exp(str1[i - 1], str2[j - 2 : j]),
- S,
- T,
- str1,
- str2,
- out,
- )
- elif (
- i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T
- ):
- out.insert(0, (str1[i - 2 : i], str2[j - 1]))
- _retrieve(
- i - 2,
- j - 1,
- s + sigma_exp(str2[j - 1], str1[i - 2 : i]),
- S,
- T,
- str1,
- str2,
- out,
- )
- elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T:
- out.insert(0, ('-', str2[j - 1]))
- _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out)
- elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T:
- out.insert(0, (str1[i - 1], '-'))
- _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out)
- elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T:
- out.insert(0, (str1[i - 1], str2[j - 1]))
- _retrieve(
- i - 1,
- j - 1,
- s + sigma_sub(str1[i - 1], str2[j - 1]),
- S,
- T,
- str1,
- str2,
- out,
- )
- return out
def sigma_skip(p):
    """
    Return the score of an indel of segment P.

    The score is the constant ``C_skip``; it does not depend on P.

    (Kondrak 2002: 54)
    """
    indel_score = C_skip
    return indel_score
def sigma_sub(p, q):
    """
    Return the score of substituting segment P with segment Q.

    Starts from ``C_sub`` and subtracts the weighted feature distance
    between the two segments as well as the vowel weight of each.

    (Kondrak 2002: 54)
    """
    distance = delta(p, q)
    vowel_weight_p = V(p)
    vowel_weight_q = V(q)
    return C_sub - distance - vowel_weight_p - vowel_weight_q
def sigma_exp(p, q):
    """
    Return the score of an expansion/compression.

    P is a single segment and Q a two-segment sequence.  Starts from
    ``C_exp`` and subtracts the distance from P to each half of Q, the vowel
    weight of P, and the larger vowel weight of the two halves of Q.

    (Kondrak 2002: 54)
    """
    first, second = q[0], q[1]
    distance_first = delta(p, first)
    distance_second = delta(p, second)
    vowel_weight_p = V(p)
    vowel_weight_q = max(V(first), V(second))
    return C_exp - distance_first - distance_second - vowel_weight_p - vowel_weight_q
def delta(p, q):
    """
    Return the salience-weighted sum of feature differences between P and Q.

    Only the features returned by ``R(p, q)`` take part in the sum.

    (Kondrak 2002: 54)
    """
    weighted_distance = 0
    for feature in R(p, q):
        # Accumulate term by term, in feature order.
        weighted_distance = weighted_distance + diff(p, q, feature) * salience[feature]
    return weighted_distance
def diff(p, q, f):
    """
    Return the difference between segments P and Q for feature F.

    Each segment's value for F is looked up in ``feature_matrix``, converted
    to a number through ``similarity_matrix``, and the absolute difference of
    the two numbers is returned.

    (Kondrak 2002: 52, 54)
    """
    features_of_p = feature_matrix[p]
    features_of_q = feature_matrix[q]
    value_p = similarity_matrix[features_of_p[f]]
    value_q = similarity_matrix[features_of_q[f]]
    return abs(value_p - value_q)
def R(p, q):
    """
    Return the features relevant for comparing segments P and Q.

    The consonant feature list ``R_c`` is used as soon as either segment is
    a consonant; otherwise the vowel feature list ``R_v`` is used.

    (Kondrak 2002: 54)
    """
    involves_consonant = p in consonants or q in consonants
    return R_c if involves_consonant else R_v
def V(p):
    """
    Return the vowel weight of segment P.

    Consonants weigh 0; any segment not listed in ``consonants`` is treated
    as a vowel and weighs ``C_vwl``.

    (Kondrak 2002: 54)
    """
    return 0 if p in consonants else C_vwl
- # === Test ===
def demo():
    """
    Print the first alignment found for each word pair in ``cognate_data``,
    the phonetic sequences used in Kondrak's (2002) dissertation.

    One line is printed per pair, in the form ``word1 ~ word2 : (a, b) ...``.
    """
    for line in cognate_data.split('\n'):
        pair = line.split(',')
        first_word, second_word = pair[0], pair[1]
        # align() may return several alignments; only the first is shown.
        best = align(first_word, second_word)[0]
        rendered = ' '.join('({}, {})'.format(a[0], a[1]) for a in best)
        print('{} ~ {} : {}'.format(first_word, second_word, rendered))
# Word pairs consumed by demo(): one "word1,word2" entry per line, joined
# with newlines and without a trailing newline.
cognate_data = '\n'.join(
    [
        'jo,ʒə', 'tu,ty', 'nosotros,nu', 'kjen,ki', 'ke,kwa',
        'todos,tu', 'una,ən', 'dos,dø', 'tres,trwa', 'ombre,om',
        'arbol,arbrə', 'pluma,plym', 'kabeθa,kap', 'boka,buʃ', 'pje,pje',
        'koraθon,kœr', 'ber,vwar', 'benir,vənir', 'deθir,dir', 'pobre,povrə',
        'ðis,dIzes', 'ðæt,das', 'wat,vas', 'nat,nixt', 'loŋ,laŋ',
        'mæn,man', 'fleʃ,flajʃ', 'bləd,blyt', 'feðər,fEdər', 'hær,hAr',
        'ir,Or', 'aj,awgə', 'nowz,nAzə', 'mawθ,munt', 'təŋ,tsuŋə',
        'fut,fys', 'nij,knI', 'hænd,hant', 'hart,herts', 'livər,lEbər',
        'ænd,ante', 'æt,ad', 'blow,flAre', 'ir,awris', 'ijt,edere',
        'fiʃ,piʃkis', 'flow,fluere', 'staɾ,stella', 'ful,plenus', 'græs,gramen',
        'hart,kordis', 'horn,korny', 'aj,ego', 'nij,genU', 'məðər,mAter',
        'mawntən,mons', 'nejm,nomen', 'njuw,nowus', 'wən,unus', 'rawnd,rotundus',
        'sow,suere', 'sit,sedere', 'θrij,tres', 'tuwθ,dentis', 'θin,tenwis',
        'kinwawa,kenuaʔ', 'nina,nenah', 'napewa,napɛw', 'wapimini,wapemen',
        'namesa,namɛʔs', 'okimawa,okemaw', 'ʃiʃipa,seʔsep', 'ahkohkwa,ahkɛh',
        'pematesiweni,pematesewen', 'asenja,aʔsɛn',
    ]
)


if __name__ == '__main__':
    demo()
|