segmentation.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. # Natural Language Toolkit: Text Segmentation Metrics
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Edward Loper <edloper@gmail.com>
  5. # Steven Bird <stevenbird1@gmail.com>
  6. # David Doukhan <david.doukhan@gmail.com>
  7. # URL: <http://nltk.org/>
  8. # For license information, see LICENSE.TXT
  9. """
  10. Text Segmentation Metrics
  11. 1. Windowdiff
  12. Pevzner, L., and Hearst, M., A Critique and Improvement of
  13. an Evaluation Metric for Text Segmentation,
  14. Computational Linguistics 28, 19-36
  15. 2. Generalized Hamming Distance
  16. Bookstein A., Kulyukin V.A., Raita T.
  17. Generalized Hamming Distance
  18. Information Retrieval 5, 2002, pp 353-375
  19. Baseline implementation in C++
  20. http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html
  21. Study describing benefits of Generalized Hamming Distance Versus
  22. WindowDiff for evaluating text segmentation tasks
  23. Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ?
  24. TALN 2009
  25. 3. Pk text segmentation metric
  26. Beeferman D., Berger A., Lafferty J. (1999)
  27. Statistical Models for Text Segmentation
  28. Machine Learning, 34, 177-210
  29. """
  30. try:
  31. import numpy as np
  32. except ImportError:
  33. pass
  34. from six.moves import range
  35. def windowdiff(seg1, seg2, k, boundary="1", weighted=False):
  36. """
  37. Compute the windowdiff score for a pair of segmentations. A
  38. segmentation is any sequence over a vocabulary of two items
  39. (e.g. "0", "1"), where the specified boundary value is used to
  40. mark the edge of a segmentation.
  41. >>> s1 = "000100000010"
  42. >>> s2 = "000010000100"
  43. >>> s3 = "100000010000"
  44. >>> '%.2f' % windowdiff(s1, s1, 3)
  45. '0.00'
  46. >>> '%.2f' % windowdiff(s1, s2, 3)
  47. '0.30'
  48. >>> '%.2f' % windowdiff(s2, s3, 3)
  49. '0.80'
  50. :param seg1: a segmentation
  51. :type seg1: str or list
  52. :param seg2: a segmentation
  53. :type seg2: str or list
  54. :param k: window width
  55. :type k: int
  56. :param boundary: boundary value
  57. :type boundary: str or int or bool
  58. :param weighted: use the weighted variant of windowdiff
  59. :type weighted: boolean
  60. :rtype: float
  61. """
  62. if len(seg1) != len(seg2):
  63. raise ValueError("Segmentations have unequal length")
  64. if k > len(seg1):
  65. raise ValueError(
  66. "Window width k should be smaller or equal than segmentation lengths"
  67. )
  68. wd = 0
  69. for i in range(len(seg1) - k + 1):
  70. ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary))
  71. if weighted:
  72. wd += ndiff
  73. else:
  74. wd += min(1, ndiff)
  75. return wd / (len(seg1) - k + 1.0)
  76. # Generalized Hamming Distance
  77. def _init_mat(nrows, ncols, ins_cost, del_cost):
  78. mat = np.empty((nrows, ncols))
  79. mat[0, :] = ins_cost * np.arange(ncols)
  80. mat[:, 0] = del_cost * np.arange(nrows)
  81. return mat
  82. def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff):
  83. for i, rowi in enumerate(rowv):
  84. for j, colj in enumerate(colv):
  85. shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j]
  86. if rowi == colj:
  87. # boundaries are at the same location, no transformation required
  88. tcost = mat[i, j]
  89. elif rowi > colj:
  90. # boundary match through a deletion
  91. tcost = del_cost + mat[i, j + 1]
  92. else:
  93. # boundary match through an insertion
  94. tcost = ins_cost + mat[i + 1, j]
  95. mat[i + 1, j + 1] = min(tcost, shift_cost)
  96. def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'):
  97. """
  98. Compute the Generalized Hamming Distance for a reference and a hypothetical
  99. segmentation, corresponding to the cost related to the transformation
  100. of the hypothetical segmentation into the reference segmentation
  101. through boundary insertion, deletion and shift operations.
  102. A segmentation is any sequence over a vocabulary of two items
  103. (e.g. "0", "1"), where the specified boundary value is used to
  104. mark the edge of a segmentation.
  105. Recommended parameter values are a shift_cost_coeff of 2.
  106. Associated with a ins_cost, and del_cost equal to the mean segment
  107. length in the reference segmentation.
  108. >>> # Same examples as Kulyukin C++ implementation
  109. >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5)
  110. 0.5
  111. >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5)
  112. 2.0
  113. >>> ghd('011', '110', 1.0, 1.0, 0.5)
  114. 1.0
  115. >>> ghd('1', '0', 1.0, 1.0, 0.5)
  116. 1.0
  117. >>> ghd('111', '000', 1.0, 1.0, 0.5)
  118. 3.0
  119. >>> ghd('000', '111', 1.0, 2.0, 0.5)
  120. 6.0
  121. :param ref: the reference segmentation
  122. :type ref: str or list
  123. :param hyp: the hypothetical segmentation
  124. :type hyp: str or list
  125. :param ins_cost: insertion cost
  126. :type ins_cost: float
  127. :param del_cost: deletion cost
  128. :type del_cost: float
  129. :param shift_cost_coeff: constant used to compute the cost of a shift.
  130. shift cost = shift_cost_coeff * |i - j| where i and j are
  131. the positions indicating the shift
  132. :type shift_cost_coeff: float
  133. :param boundary: boundary value
  134. :type boundary: str or int or bool
  135. :rtype: float
  136. """
  137. ref_idx = [i for (i, val) in enumerate(ref) if val == boundary]
  138. hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary]
  139. nref_bound = len(ref_idx)
  140. nhyp_bound = len(hyp_idx)
  141. if nref_bound == 0 and nhyp_bound == 0:
  142. return 0.0
  143. elif nref_bound > 0 and nhyp_bound == 0:
  144. return nref_bound * ins_cost
  145. elif nref_bound == 0 and nhyp_bound > 0:
  146. return nhyp_bound * del_cost
  147. mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost)
  148. _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff)
  149. return mat[-1, -1]
  150. # Beeferman's Pk text segmentation evaluation metric
  151. def pk(ref, hyp, k=None, boundary='1'):
  152. """
  153. Compute the Pk metric for a pair of segmentations A segmentation
  154. is any sequence over a vocabulary of two items (e.g. "0", "1"),
  155. where the specified boundary value is used to mark the edge of a
  156. segmentation.
  157. >>> '%.2f' % pk('0100'*100, '1'*400, 2)
  158. '0.50'
  159. >>> '%.2f' % pk('0100'*100, '0'*400, 2)
  160. '0.50'
  161. >>> '%.2f' % pk('0100'*100, '0100'*100, 2)
  162. '0.00'
  163. :param ref: the reference segmentation
  164. :type ref: str or list
  165. :param hyp: the segmentation to evaluate
  166. :type hyp: str or list
  167. :param k: window size, if None, set to half of the average reference segment length
  168. :type boundary: str or int or bool
  169. :param boundary: boundary value
  170. :type boundary: str or int or bool
  171. :rtype: float
  172. """
  173. if k is None:
  174. k = int(round(len(ref) / (ref.count(boundary) * 2.0)))
  175. err = 0
  176. for i in range(len(ref) - k + 1):
  177. r = ref[i : i + k].count(boundary) > 0
  178. h = hyp[i : i + k].count(boundary) > 0
  179. if r != h:
  180. err += 1
  181. return err / (len(ref) - k + 1.0)
  182. # skip doctests if numpy is not installed
  183. def setup_module(module):
  184. from nose import SkipTest
  185. try:
  186. import numpy
  187. except ImportError:
  188. raise SkipTest("numpy is required for nltk.metrics.segmentation")