12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- # Natural Language Toolkit: Spearman Rank Correlation
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Joel Nothman <jnothman@student.usyd.edu.au>
- # URL: <http://nltk.org>
- # For license information, see LICENSE.TXT
- from __future__ import division
- """
- Tools for comparing ranked lists.
- """
- def _rank_dists(ranks1, ranks2):
- """Finds the difference between the values in ranks1 and ranks2 for keys
- present in both dicts. If the arguments are not dicts, they are converted
- from (key, rank) sequences.
- """
- ranks1 = dict(ranks1)
- ranks2 = dict(ranks2)
- for k in ranks1:
- try:
- yield k, ranks1[k] - ranks2[k]
- except KeyError:
- pass
- def spearman_correlation(ranks1, ranks2):
- """Returns the Spearman correlation coefficient for two rankings, which
- should be dicts or sequences of (key, rank). The coefficient ranges from
- -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only
- calculated for keys in both rankings (for meaningful results, remove keys
- present in only one list before ranking)."""
- n = 0
- res = 0
- for k, d in _rank_dists(ranks1, ranks2):
- res += d * d
- n += 1
- try:
- return 1 - (6 * res / (n * (n * n - 1)))
- except ZeroDivisionError:
- # Result is undefined if only one item is ranked
- return 0.0
- def ranks_from_sequence(seq):
- """Given a sequence, yields each element with an increasing rank, suitable
- for use as an argument to ``spearman_correlation``.
- """
- return ((k, i) for i, k in enumerate(seq))
- def ranks_from_scores(scores, rank_gap=1e-15):
- """Given a sequence of (key, score) tuples, yields each key with an
- increasing rank, tying with previous key's rank if the difference between
- their scores is less than rank_gap. Suitable for use as an argument to
- ``spearman_correlation``.
- """
- prev_score = None
- rank = 0
- for i, (key, score) in enumerate(scores):
- try:
- if abs(score - prev_score) > rank_gap:
- rank = i
- except TypeError:
- pass
- yield key, rank
- prev_score = score
|