# Natural Language Toolkit: Minimal Sets
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org>
# For license information, see LICENSE.TXT

  7. from collections import defaultdict
  8. class MinimalSet(object):
  9. """
  10. Find contexts where more than one possible target value can
  11. appear. E.g. if targets are word-initial letters, and contexts
  12. are the remainders of words, then we would like to find cases like
  13. "fat" vs "cat", and "training" vs "draining". If targets are
  14. parts-of-speech and contexts are words, then we would like to find
  15. cases like wind (noun) 'air in rapid motion', vs wind (verb)
  16. 'coil, wrap'.
  17. """
  18. def __init__(self, parameters=None):
  19. """
  20. Create a new minimal set.
  21. :param parameters: The (context, target, display) tuples for the item
  22. :type parameters: list(tuple(str, str, str))
  23. """
  24. self._targets = set() # the contrastive information
  25. self._contexts = set() # what we are controlling for
  26. self._seen = defaultdict(set) # to record what we have seen
  27. self._displays = {} # what we will display
  28. if parameters:
  29. for context, target, display in parameters:
  30. self.add(context, target, display)
  31. def add(self, context, target, display):
  32. """
  33. Add a new item to the minimal set, having the specified
  34. context, target, and display form.
  35. :param context: The context in which the item of interest appears
  36. :type context: str
  37. :param target: The item of interest
  38. :type target: str
  39. :param display: The information to be reported for each item
  40. :type display: str
  41. """
  42. # Store the set of targets that occurred in this context
  43. self._seen[context].add(target)
  44. # Keep track of which contexts and targets we have seen
  45. self._contexts.add(context)
  46. self._targets.add(target)
  47. # For a given context and target, store the display form
  48. self._displays[(context, target)] = display
  49. def contexts(self, minimum=2):
  50. """
  51. Determine which contexts occurred with enough distinct targets.
  52. :param minimum: the minimum number of distinct target forms
  53. :type minimum: int
  54. :rtype list
  55. """
  56. return [c for c in self._contexts if len(self._seen[c]) >= minimum]
  57. def display(self, context, target, default=""):
  58. if (context, target) in self._displays:
  59. return self._displays[(context, target)]
  60. else:
  61. return default
  62. def display_all(self, context):
  63. result = []
  64. for target in self._targets:
  65. x = self.display(context, target)
  66. if x:
  67. result.append(x)
  68. return result
  69. def targets(self):
  70. return self._targets