# -*- coding: utf-8 -*-
# Natural Language Toolkit: Distance Metrics
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         Tom Lippincott <tom@cs.columbia.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
#
- """
- Distance Metrics.
- Compute the distance between two items (usually strings).
- As metrics, they must satisfy the following three requirements:
- 1. d(a, a) = 0
- 2. d(a, b) >= 0
- 3. d(a, c) <= d(a, b) + d(b, c)
- """
- from __future__ import print_function
- from __future__ import division
- import warnings
- import operator


def _edit_dist_init(len1, len2):
    lev = []
    for i in range(len1):
        lev.append([0] * len2)  # initialize 2D array to zero
    for i in range(len1):
        lev[i][0] = i  # column 0: 0, 1, 2, 3, 4, ...
    for j in range(len2):
        lev[0][j] = j  # row 0: 0, 1, 2, 3, 4, ...
    return lev
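
# For instance, _edit_dist_init(3, 3) returns
# [[0, 1, 2], [1, 0, 0], [2, 0, 0]]: the first row and first column hold
# the base-case distances against the empty prefix.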


def _edit_dist_step(lev, i, j, s1, s2, substitution_cost=1, transpositions=False):
    """Fill in entry ``lev[i][j]`` of the Levenshtein matrix."""
    c1 = s1[i - 1]
    c2 = s2[j - 1]

    # skipping a character in s1
    a = lev[i - 1][j] + 1
    # skipping a character in s2
    b = lev[i][j - 1] + 1
    # substitution
    c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0)

    # transposition
    d = c + 1  # never picked by default
    if transpositions and i > 1 and j > 1:
        if s1[i - 2] == c2 and s2[j - 2] == c1:
            d = lev[i - 2][j - 2] + 1

    # pick the cheapest
    lev[i][j] = min(a, b, c, d)
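
# A worked instance of one step (a sketch, using the helpers above): with
# s1 = "ab", s2 = "ba" and transpositions=True, filling lev[2][2] compares
# deletion (lev[1][2] + 1 = 2), insertion (lev[2][1] + 1 = 2), substitution
# (lev[1][1] + 1 = 2), and the adjacent swap "ab" <-> "ba"
# (lev[0][0] + 1 = 1), so the cell receives 1.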


def edit_distance(s1, s2, substitution_cost=1, transpositions=False):
    """
    Calculate the Levenshtein edit-distance between two strings.
    The edit distance is the number of characters that need to be
    substituted, inserted, or deleted, to transform s1 into s2.  For
    example, transforming "rain" to "shine" requires three steps,
    consisting of two substitutions and one insertion:
    "rain" -> "sain" -> "shin" -> "shine".  These operations could have
    been done in other orders, but at least three steps are needed.

    Allows specifying the cost of substitution edits (e.g., "a" -> "b"),
    because sometimes it makes sense to assign greater penalties to
    substitutions.

    This also optionally allows transposition edits (e.g., "ab" -> "ba"),
    though this is disabled by default.

    :param s1, s2: The strings to be analysed
    :param substitution_cost: The cost of a substitution edit
    :param transpositions: Whether to allow transposition edits
    :type s1: str
    :type s2: str
    :type substitution_cost: int
    :type transpositions: bool
    :rtype: int
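
    Quick checks (illustrative doctests; the first value restates the
    example above, the second shows a transposition counted as one edit):

    >>> edit_distance("rain", "shine")
    3
    >>> edit_distance("abc", "acb", transpositions=True)
    1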
- """
    # set up a 2-D array
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)

    # iterate over the array
    for i in range(len1):
        for j in range(len2):
            _edit_dist_step(
                lev,
                i + 1,
                j + 1,
                s1,
                s2,
                substitution_cost=substitution_cost,
                transpositions=transpositions,
            )
    return lev[len1][len2]


def _edit_dist_backtrace(lev):
    i, j = len(lev) - 1, len(lev[0]) - 1
    alignment = [(i, j)]

    while (i, j) != (0, 0):
        directions = [
            (i - 1, j),  # skip s1
            (i, j - 1),  # skip s2
            (i - 1, j - 1),  # substitution
        ]
        # Guard against negative indices, which would silently wrap
        # around in Python, by pricing them at infinity.
        direction_costs = (
            (lev[di][dj] if (di >= 0 and dj >= 0) else float('inf'), (di, dj))
            for di, dj in directions
        )
        _, (i, j) = min(direction_costs, key=operator.itemgetter(0))
        alignment.append((i, j))

    return list(reversed(alignment))


def edit_distance_align(s1, s2, substitution_cost=1):
    """
    Calculate the minimum Levenshtein edit-distance based alignment
    mapping between two strings. The alignment finds the mapping
    from string s1 to s2 that minimizes the edit distance cost.
    For example, mapping "rain" to "shine" would involve 2
    substitutions, 2 matches and an insertion resulting in
    the following mapping:
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]
    NB: (0, 0) is the start state without any letters associated.

    See more: https://web.stanford.edu/class/cs124/lec/med.pdf

    In case of multiple valid minimum-distance alignments, the
    backtrace has the following operation precedence:

    1. Skip s1 character
    2. Skip s2 character
    3. Substitute s1 and s2 characters

    The backtrace is carried out in reverse string order.
    This function does not support transposition.

    :param s1, s2: The strings to be aligned
    :type s1: str
    :type s2: str
    :type substitution_cost: int
    :rtype: List[Tuple[int, int]]
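
    A quick check (illustrative doctest; the expected mapping is the one
    shown above):

    >>> edit_distance_align("rain", "shine")
    [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)]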
- """
    # set up a 2-D array
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)

    # iterate over the array
    for i in range(len1):
        for j in range(len2):
            _edit_dist_step(
                lev,
                i + 1,
                j + 1,
                s1,
                s2,
                substitution_cost=substitution_cost,
                transpositions=False,
            )

    # backtrace to find alignment
    alignment = _edit_dist_backtrace(lev)
    return alignment


def binary_distance(label1, label2):
    """Simple equality test.

    0.0 if the labels are identical, 1.0 if they are different.

    >>> from nltk.metrics import binary_distance
    >>> binary_distance(1, 1)
    0.0

    >>> binary_distance(1, 3)
    1.0
    """
    return 0.0 if label1 == label2 else 1.0


def jaccard_distance(label1, label2):
    """Distance metric comparing set-similarity: one minus the ratio of
    the sizes of the sets' intersection and union (the Jaccard index).
- """
    return (len(label1.union(label2)) - len(label1.intersection(label2))) / len(
        label1.union(label2)
    )


def masi_distance(label1, label2):
    """Distance metric that takes into account partial agreement when multiple
    labels are assigned.

    >>> from nltk.metrics import masi_distance
    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
    0.665
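
    (Here the intersection has 2 items, the union has 4, and one set is a
    subset of the other, so m = 0.67 and the distance is
    1 - 2/4 * 0.67 = 0.665.)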

    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
    for Semantic and Pragmatic Annotation.
    """
    len_intersection = len(label1.intersection(label2))
    len_union = len(label1.union(label2))
    len_label1 = len(label1)
    len_label2 = len(label2)

    if len_label1 == len_label2 and len_label1 == len_intersection:
        m = 1
    elif len_intersection == min(len_label1, len_label2):
        m = 0.67
    elif len_intersection > 0:
        m = 0.33
    else:
        m = 0

    return 1 - len_intersection / len_union * m


def interval_distance(label1, label2):
    """Krippendorff's interval distance metric

    >>> from nltk.metrics import interval_distance
    >>> interval_distance(1, 10)
    81

    Krippendorff 1980, Content Analysis: An Introduction to its Methodology
    """
    try:
        return pow(label1 - label2, 2)
        # return pow(list(label1)[0] - list(label2)[0], 2)
    except TypeError:
        raise ValueError("non-numeric labels not supported with interval distance")


def presence(label):
    """Higher-order function to test presence of a given label.
- """
- return lambda x, y: 1.0 * ((label in x) == (label in y))


def fractional_presence(label):
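    # Returns a distance function over two label sets x and y:
    #   - label in both x and y:  |1/len(x) - 1/len(y)|
    #   - label in neither:       0.0
    #   - label only in x:        1/len(x)
    #   - label only in y:        1/len(y)
    # The chained `or` expressions below select whichever case applies.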
    return (
        lambda x, y: abs(((1.0 / len(x)) - (1.0 / len(y))))
        * (label in x and label in y)
        or 0.0 * (label not in x and label not in y)
        or abs((1.0 / len(x))) * (label in x and label not in y)
        or ((1.0 / len(y))) * (label not in x and label in y)
    )


def custom_distance(file):
    """Build a distance function from a tab-separated file whose lines
    each contain ``labelA<TAB>labelB<TAB>distance``."""
    data = {}
    with open(file, 'r') as infile:
        for l in infile:
            labelA, labelB, dist = l.strip().split("\t")
            labelA = frozenset([labelA])
            labelB = frozenset([labelB])
            data[frozenset([labelA, labelB])] = float(dist)
    return lambda x, y: data[frozenset([x, y])]
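
# For example (hypothetical file name): if "distances.tsv" contains the
# line "cat<TAB>dog<TAB>0.5", then
# custom_distance("distances.tsv")(frozenset(['cat']), frozenset(['dog']))
# returns 0.5.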


def jaro_similarity(s1, s2):
    """
    Computes the Jaro similarity between 2 sequences from:

        Matthew A. Jaro (1989). Advances in record linkage methodology
        as applied to the 1985 census of Tampa Florida. Journal of the
        American Statistical Association. 84 (406): 414-20.

    The Jaro similarity measures the agreement between two sequences in
    terms of matching characters and transpositions. The formula, from
    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :

        jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/|s_2| + (m-t)/m)

    where:
        - |s_i| is the length of string s_i
        - m is the number of matching characters
        - t is half the number of transpositions (matching characters
          that appear in a different order in the two sequences).
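
    For example, using a pair from Winkler's test tables (see
    jaro_winkler_similarity() below):

    >>> round(jaro_similarity("dwayne", "duane"), 3)
    0.822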
- """
    # First, store the lengths of the strings
    # because they will be re-used several times.
    len_s1, len_s2 = len(s1), len(s2)

    # The upper bound of the distance for being a matched character.
    # (Clamped at zero so that strings of length <= 2, where the textbook
    # bound max(|s1|, |s2|) // 2 - 1 goes negative, still allow
    # same-position matches.)
    match_bound = max(max(len_s1, len_s2) // 2 - 1, 0)

    # Initialize the counts for matches and transpositions.
    matches = 0  # no. of matched characters in s1 and s2
    transpositions = 0  # no. of transpositions between s1 and s2
    flagged_1 = []  # positions in s1 which match some character in s2
    flagged_2 = []  # positions in s2 which match some character in s1

    # Iterate through sequences, check for matches and compute transpositions.
    for i in range(len_s1):  # Iterate through each character.
        upperbound = min(i + match_bound, len_s2 - 1)
        lowerbound = max(0, i - match_bound)
        for j in range(lowerbound, upperbound + 1):
            if s1[i] == s2[j] and j not in flagged_2:
                matches += 1
                flagged_1.append(i)
                flagged_2.append(j)
                break
    flagged_2.sort()

    # Matched characters that differ pairwise after aligning the flags
    # count as transpositions (halved in the formula below).
    for i, j in zip(flagged_1, flagged_2):
        if s1[i] != s2[j]:
            transpositions += 1

    if matches == 0:
        return 0
    else:
        return (
            1
            / 3
            * (
                matches / len_s1
                + matches / len_s2
                + (matches - transpositions // 2) / matches
            )
        )


def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
    """
    The Jaro-Winkler similarity is an extension of the Jaro similarity in:

        William E. Winkler. 1990. String Comparator Metrics and Enhanced
        Decision Rules in the Fellegi-Sunter Model of Record Linkage.
        Proceedings of the Section on Survey Research Methods.
        American Statistical Association: 354-359.

    such that:

        jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )

    where,

    - jaro_sim is the output from the Jaro similarity,
      see jaro_similarity()
    - l is the length of the common prefix at the start of the string;
      this implementation caps l at an upper bound (max_l) when counting
      the prefix. A common value of this upper bound is 4.
    - p is the constant scaling factor to overweigh common prefixes.
      The Jaro-Winkler similarity will fall within the [0, 1] bound,
      given that max(p) <= 0.25; the default is p=0.1 as in Winkler (1990).

    Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
    from "Table 5 Comparison of String Comparators Rescaled between 0 and 1":

    >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
    ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
    ... ("dixon", "dickson"), ("billy", "susan")]
    >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
    >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]

    One way to match the values in Winkler's paper is to provide a different
    p scaling factor for different pairs of strings, e.g.

    >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]

    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore

    Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
    "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names":

    >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
    ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
    ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
    ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
    ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
    ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
    ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
    ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]

    >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
    ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
    ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]
    >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
    ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
    ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]

    Again, a different p scaling factor is needed for some pairs:

    >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
    ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]

    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
    ...     if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
    ...         continue  # Skip bad examples from the paper.
    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore

    This test case shows that the output of the Jaro-Winkler similarity depends
    on the product l * p and not on the product max_l * p. Here the product
    max_l * p > 1, but the product l * p <= 1:

    >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
    0.88
    """
    # To ensure that the output of the Jaro-Winkler similarity
    # falls between [0, 1], the product of l * p needs to
    # also fall between [0, 1].
    if not 0 <= max_l * p <= 1:
        warnings.warn(
            "The product `max_l * p` might not fall between [0,1]. "
            "Jaro-Winkler similarity might not be between 0 and 1."
        )

    # Compute the Jaro similarity.
    jaro_sim = jaro_similarity(s1, s2)

    # Compute the length of the common prefix shared by s1 and s2,
    # capped at max_l characters.
    l = 0
    # zip() will automatically loop until the end of the shorter string.
    for s1_i, s2_i in zip(s1, s2):
        if s1_i == s2_i:
            l += 1
        else:
            break
        if l == max_l:
            break

    # Return the similarity value as described in the docstring.
    return jaro_sim + (l * p * (1 - jaro_sim))


def demo():
    string_distance_examples = [
        ("rain", "shine"),
        ("abcdef", "acbdef"),
        ("language", "lnaguaeg"),
        ("language", "lnaugage"),
        ("language", "lngauage"),
    ]
    for s1, s2 in string_distance_examples:
        print("Edit distance btwn '%s' and '%s':" % (s1, s2), edit_distance(s1, s2))
        print(
            "Edit dist with transpositions btwn '%s' and '%s':" % (s1, s2),
            edit_distance(s1, s2, transpositions=True),
        )
        print("Jaro similarity btwn '%s' and '%s':" % (s1, s2), jaro_similarity(s1, s2))
        print(
            "Jaro-Winkler similarity btwn '%s' and '%s':" % (s1, s2),
            jaro_winkler_similarity(s1, s2),
        )
        print(
            "Jaro-Winkler distance btwn '%s' and '%s':" % (s1, s2),
            1 - jaro_winkler_similarity(s1, s2),
        )

    s1 = set([1, 2, 3, 4])
    s2 = set([3, 4, 5])
    print("s1:", s1)
    print("s2:", s2)
    print("Binary distance:", binary_distance(s1, s2))
    print("Jaccard distance:", jaccard_distance(s1, s2))
    print("MASI distance:", masi_distance(s1, s2))


if __name__ == '__main__':
    demo()