spearman.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. # Natural Language Toolkit: Spearman Rank Correlation
  2. #
  3. # Copyright (C) 2001-2019 NLTK Project
  4. # Author: Joel Nothman <jnothman@student.usyd.edu.au>
  5. # URL: <http://nltk.org>
  6. # For license information, see LICENSE.TXT
  7. from __future__ import division
  8. """
  9. Tools for comparing ranked lists.
  10. """
  11. def _rank_dists(ranks1, ranks2):
  12. """Finds the difference between the values in ranks1 and ranks2 for keys
  13. present in both dicts. If the arguments are not dicts, they are converted
  14. from (key, rank) sequences.
  15. """
  16. ranks1 = dict(ranks1)
  17. ranks2 = dict(ranks2)
  18. for k in ranks1:
  19. try:
  20. yield k, ranks1[k] - ranks2[k]
  21. except KeyError:
  22. pass
  23. def spearman_correlation(ranks1, ranks2):
  24. """Returns the Spearman correlation coefficient for two rankings, which
  25. should be dicts or sequences of (key, rank). The coefficient ranges from
  26. -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
  27. calculated for keys in both rankings (for meaningful results, remove keys
  28. present in only one list before ranking)."""
  29. n = 0
  30. res = 0
  31. for k, d in _rank_dists(ranks1, ranks2):
  32. res += d * d
  33. n += 1
  34. try:
  35. return 1 - (6 * res / (n * (n * n - 1)))
  36. except ZeroDivisionError:
  37. # Result is undefined if only one item is ranked
  38. return 0.0
  39. def ranks_from_sequence(seq):
  40. """Given a sequence, yields each element with an increasing rank, suitable
  41. for use as an argument to ``spearman_correlation``.
  42. """
  43. return ((k, i) for i, k in enumerate(seq))
  44. def ranks_from_scores(scores, rank_gap=1e-15):
  45. """Given a sequence of (key, score) tuples, yields each key with an
  46. increasing rank, tying with previous key's rank if the difference between
  47. their scores is less than rank_gap. Suitable for use as an argument to
  48. ``spearman_correlation``.
  49. """
  50. prev_score = None
  51. rank = 0
  52. for i, (key, score) in enumerate(scores):
  53. try:
  54. if abs(score - prev_score) > rank_gap:
  55. rank = i
  56. except TypeError:
  57. pass
  58. yield key, rank
  59. prev_score = score