# Natural Language Toolkit: Text Segmentation Metrics
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
#         David Doukhan <david.doukhan@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- """
- Text Segmentation Metrics
- 1. Windowdiff
- Pevzner, L., and Hearst, M., A Critique and Improvement of
- an Evaluation Metric for Text Segmentation,
- Computational Linguistics 28, 19-36
- 2. Generalized Hamming Distance
- Bookstein A., Kulyukin V.A., Raita T.
- Generalized Hamming Distance
- Information Retrieval 5, 2002, pp 353-375
- Baseline implementation in C++
- http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
- Study describing benefits of Generalized Hamming Distance Versus
- WindowDiff for evaluating text segmentation tasks
- Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
- TALN 2009
- 3. Pk text segmentation metric
- Beeferman D., Berger A., Lafferty J. (1999)
- Statistical Models for Text Segmentation
- Machine Learning, 34, 177-210
- """

try:
    import numpy as np
except ImportError:
    # numpy is only required for ghd(); windowdiff() and pk() work without it.
    pass


def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
    """
    Compute the windowdiff score for a pair of segmentations.  A
    segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    >>> s1 = "000100000010"
    >>> s2 = "000010000100"
    >>> s3 = "100000010000"
    >>> '%.2f' % windowdiff(s1, s1, 3)
    '0.00'
    >>> '%.2f' % windowdiff(s1, s2, 3)
    '0.30'
    >>> '%.2f' % windowdiff(s2, s3, 3)
    '0.80'

    :param seg1: a segmentation
    :type seg1: str or list
    :param seg2: a segmentation
    :type seg2: str or list
    :param k: window width
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :param weighted: use the weighted variant of windowdiff
    :type weighted: boolean
    :rtype: float
    """
    if len(seg1) != len(seg2):
        raise ValueError("Segmentations have unequal length")
    if k > len(seg1):
        raise ValueError(
            "Window width k should be smaller than or equal to the segmentation lengths"
        )

    wd = 0
    # Slide a window of width k over both segmentations, comparing the
    # number of boundaries observed in each window.
    for i in range(len(seg1) - k + 1):
        ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
        if weighted:
            wd += ndiff
        else:
            wd += min(1, ndiff)
    return wd / (len(seg1) - k + 1.0)
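

# A minimal sketch (hypothetical helper, not part of NLTK's API) of the
# window-width choice recommended by Pevzner and Hearst: set k to half the
# mean segment length of the reference segmentation, which is exactly the
# default that pk() computes below when k is None.
def _half_mean_segment_length(ref, boundary="1"):
    """Return half the mean segment length of ``ref``, rounded to an int.

    >>> _half_mean_segment_length('0100' * 100)
    2
    """
    return int(round(len(ref) / (ref.count(boundary) * 2.0)))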


# Generalized Hamming Distance


def _init_mat(nrows, ncols, ins_cost, del_cost):
    # Seed the dynamic-programming matrix: transforming against an empty
    # boundary sequence costs one insertion or deletion per boundary.
    mat = np.empty((nrows, ncols))
    mat[0, :] = ins_cost * np.arange(ncols)
    mat[:, 0] = del_cost * np.arange(nrows)
    return mat


def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
    # Fill the matrix with an edit-distance-style recurrence over the
    # boundary positions of the hypothesis (rows) and reference (columns).
    for i, rowi in enumerate(rowv):
        for j, colj in enumerate(colv):
            shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
            if rowi == colj:
                # boundaries are at the same location, no transformation required
                tcost = mat[i, j]
            elif rowi > colj:
                # boundary match through a deletion
                tcost = del_cost + mat[i, j + 1]
            else:
                # boundary match through an insertion
                tcost = ins_cost + mat[i + 1, j]
            mat[i + 1, j + 1] = min(tcost, shift_cost)
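

# A worked illustration of the dynamic program above (a sketch for
# exposition only, assuming numpy is installed).  With one hypothesis
# boundary at position 4 and one reference boundary at position 2, the
# cheapest transformation is a shift of two positions:
#
#     mat = _init_mat(2, 2, ins_cost=1.0, del_cost=1.0)
#     _ghd_aux(mat, [4], [2], 1.0, 1.0, shift_cost_coeff=0.5)
#     mat[-1, -1]  # 1.0, i.e. shift_cost_coeff * |4 - 2|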


def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
    """
    Compute the Generalized Hamming Distance for a reference and a
    hypothetical segmentation, corresponding to the cost related to the
    transformation of the hypothetical segmentation into the reference
    segmentation through boundary insertion, deletion and shift operations.

    A segmentation is any sequence over a vocabulary of two items
    (e.g. "0", "1"), where the specified boundary value is used to
    mark the edge of a segmentation.

    The recommended parameter values are a shift_cost_coeff of 2, with
    ins_cost and del_cost equal to the mean segment length in the
    reference segmentation.

    >>> # Same examples as the Kulyukin C++ implementation
    >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
    0.5
    >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
    2.0
    >>> ghd('011', '110', 1.0, 1.0, 0.5)
    1.0
    >>> ghd('1', '0', 1.0, 1.0, 0.5)
    1.0
    >>> ghd('111', '000', 1.0, 1.0, 0.5)
    3.0
    >>> ghd('000', '111', 1.0, 2.0, 0.5)
    6.0

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the hypothetical segmentation
    :type hyp: str or list
    :param ins_cost: insertion cost
    :type ins_cost: float
    :param del_cost: deletion cost
    :type del_cost: float
    :param shift_cost_coeff: constant used to compute the cost of a shift.
        shift cost = shift_cost_coeff * |i - j| where i and j are
        the positions indicating the shift
    :type shift_cost_coeff: float
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """
    ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
    hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]

    nref_bound = len(ref_idx)
    nhyp_bound = len(hyp_idx)

    # Degenerate cases: if one segmentation has no boundaries, the distance
    # is a pure run of insertions or deletions.
    if nref_bound == 0 and nhyp_bound == 0:
        return 0.0
    elif nref_bound > 0 and nhyp_bound == 0:
        return nref_bound * ins_cost
    elif nref_bound == 0 and nhyp_bound > 0:
        return nhyp_bound * del_cost

    mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
    _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
    return mat[-1, -1]
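

# A minimal sketch (hypothetical helper, not part of NLTK's API) of the
# parameter choice recommended in the docstring above: a shift_cost_coeff
# of 2, with ins_cost and del_cost set to the mean segment length of the
# reference segmentation.
def _recommended_ghd(ref, hyp, boundary='1'):
    """Call ``ghd`` with the recommended parameter values.

    >>> _recommended_ghd('0100' * 100, '0100' * 100)
    0.0
    """
    mean_seg_len = len(ref) / ref.count(boundary)
    return ghd(
        ref,
        hyp,
        ins_cost=mean_seg_len,
        del_cost=mean_seg_len,
        shift_cost_coeff=2.0,
        boundary=boundary,
    )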


# Beeferman's Pk text segmentation evaluation metric


def pk(ref, hyp, k=None, boundary='1'):
    """
    Compute the Pk metric for a pair of segmentations.  A segmentation
    is any sequence over a vocabulary of two items (e.g. "0", "1"),
    where the specified boundary value is used to mark the edge of a
    segmentation.

    >>> '%.2f' % pk('0100'*100, '1'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0'*400, 2)
    '0.50'
    >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
    '0.00'

    :param ref: the reference segmentation
    :type ref: str or list
    :param hyp: the segmentation to evaluate
    :type hyp: str or list
    :param k: window size; if None, set to half of the average reference segment length
    :type k: int
    :param boundary: boundary value
    :type boundary: str or int or bool
    :rtype: float
    """
    if k is None:
        k = int(round(len(ref) / (ref.count(boundary) * 2.0)))

    err = 0
    # Count the windows in which the two segmentations disagree about
    # whether a boundary occurs at all.
    for i in range(len(ref) - k + 1):
        r = ref[i : i + k].count(boundary) > 0
        h = hyp[i : i + k].count(boundary) > 0
        if r != h:
            err += 1
    return err / (len(ref) - k + 1.0)
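

# A small comparison sketch (illustrative values, not from the references):
# pk only checks *whether* each window contains a boundary, while windowdiff
# also compares *how many* boundaries each window contains, making it the
# stricter of the two metrics:
#
#     pk('01100', '01010', 2)          # 0.25
#     windowdiff('01100', '01010', 2)  # 0.5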


# skip doctests if numpy is not installed
def setup_module(module):
    from nose import SkipTest

    try:
        import numpy
    except ImportError:
        raise SkipTest("numpy is required for nltk.metrics.segmentation")