# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
# based on previous (nltk2) version by
# Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import division, print_function, unicode_literals

from abc import ABCMeta, abstractmethod

from six import add_metaclass
  13. @add_metaclass(ABCMeta)
  14. class Feature(object):
  15. """
  16. An abstract base class for Features. A Feature is a combination of
  17. a specific property-computing method and a list of relative positions
  18. to apply that method to.
  19. The property-computing method, M{extract_property(tokens, index)},
  20. must be implemented by every subclass. It extracts or computes a specific
  21. property for the token at the current index. Typical extract_property()
  22. methods return features such as the token text or tag; but more involved
  23. methods may consider the entire sequence M{tokens} and
  24. for instance compute the length of the sentence the token belongs to.
  25. In addition, the subclass may have a PROPERTY_NAME, which is how
  26. it will be printed (in Rules and Templates, etc). If not given, defaults
  27. to the classname.
  28. """
  29. json_tag = 'nltk.tbl.Feature'
  30. PROPERTY_NAME = None
  31. def __init__(self, positions, end=None):
  32. """
  33. Construct a Feature which may apply at C{positions}.
  34. #For instance, importing some concrete subclasses (Feature is abstract)
  35. >>> from nltk.tag.brill import Word, Pos
  36. #Feature Word, applying at one of [-2, -1]
  37. >>> Word([-2,-1])
  38. Word([-2, -1])
  39. #Positions need not be contiguous
  40. >>> Word([-2,-1, 1])
  41. Word([-2, -1, 1])
  42. #Contiguous ranges can alternatively be specified giving the
  43. #two endpoints (inclusive)
  44. >>> Pos(-3, -1)
  45. Pos([-3, -2, -1])
  46. #In two-arg form, start <= end is enforced
  47. >>> Pos(2, 1)
  48. Traceback (most recent call last):
  49. File "<stdin>", line 1, in <module>
  50. File "nltk/tbl/template.py", line 306, in __init__
  51. raise TypeError
  52. ValueError: illegal interval specification: (start=2, end=1)
  53. :type positions: list of int
  54. :param positions: the positions at which this features should apply
  55. :raises ValueError: illegal position specifications
  56. An alternative calling convention, for contiguous positions only,
  57. is Feature(start, end):
  58. :type start: int
  59. :param start: start of range where this feature should apply
  60. :type end: int
  61. :param end: end of range (NOTE: inclusive!) where this feature should apply
  62. """
  63. self.positions = None # to avoid warnings
  64. if end is None:
  65. self.positions = tuple(sorted(set(int(i) for i in positions)))
  66. else: # positions was actually not a list, but only the start index
  67. try:
  68. if positions > end:
  69. raise TypeError
  70. self.positions = tuple(range(positions, end + 1))
  71. except TypeError:
  72. # let any kind of erroneous spec raise ValueError
  73. raise ValueError(
  74. "illegal interval specification: (start={0}, end={1})".format(
  75. positions, end
  76. )
  77. )
  78. # set property name given in subclass, or otherwise name of subclass
  79. self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__
  80. def encode_json_obj(self):
  81. return self.positions
  82. @classmethod
  83. def decode_json_obj(cls, obj):
  84. positions = obj
  85. return cls(positions)
  86. def __repr__(self):
  87. return "%s(%r)" % (self.__class__.__name__, list(self.positions))
  88. @classmethod
  89. def expand(cls, starts, winlens, excludezero=False):
  90. """
  91. Return a list of features, one for each start point in starts
  92. and for each window length in winlen. If excludezero is True,
  93. no Features containing 0 in its positions will be generated
  94. (many tbl trainers have a special representation for the
  95. target feature at [0])
  96. For instance, importing a concrete subclass (Feature is abstract)
  97. >>> from nltk.tag.brill import Word
  98. First argument gives the possible start positions, second the
  99. possible window lengths
  100. >>> Word.expand([-3,-2,-1], [1])
  101. [Word([-3]), Word([-2]), Word([-1])]
  102. >>> Word.expand([-2,-1], [1])
  103. [Word([-2]), Word([-1])]
  104. >>> Word.expand([-3,-2,-1], [1,2])
  105. [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]
  106. >>> Word.expand([-2,-1], [1])
  107. [Word([-2]), Word([-1])]
  108. a third optional argument excludes all Features whose positions contain zero
  109. >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
  110. [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]
  111. >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
  112. [Word([-2]), Word([-1]), Word([-2, -1])]
  113. All window lengths must be positive
  114. >>> Word.expand([-2,-1], [0])
  115. Traceback (most recent call last):
  116. File "<stdin>", line 1, in <module>
  117. File "nltk/tag/tbl/template.py", line 371, in expand
  118. :param starts: where to start looking for Feature
  119. ValueError: non-positive window length in [0]
  120. :param starts: where to start looking for Feature
  121. :type starts: list of ints
  122. :param winlens: window lengths where to look for Feature
  123. :type starts: list of ints
  124. :param excludezero: do not output any Feature with 0 in any of its positions.
  125. :type excludezero: bool
  126. :returns: list of Features
  127. :raises ValueError: for non-positive window lengths
  128. """
  129. if not all(x > 0 for x in winlens):
  130. raise ValueError("non-positive window length in {0}".format(winlens))
  131. xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
  132. return [cls(x) for x in xs if not (excludezero and 0 in x)]
  133. def issuperset(self, other):
  134. """
  135. Return True if this Feature always returns True when other does
  136. More precisely, return True if this feature refers to the same property as other;
  137. and this Feature looks at all positions that other does (and possibly
  138. other positions in addition).
  139. #For instance, importing a concrete subclass (Feature is abstract)
  140. >>> from nltk.tag.brill import Word, Pos
  141. >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
  142. True
  143. >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
  144. False
  145. #Feature subclasses must agree
  146. >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
  147. False
  148. :param other: feature with which to compare
  149. :type other: (subclass of) Feature
  150. :return: True if this feature is superset, otherwise False
  151. :rtype: bool
  152. """
  153. return self.__class__ is other.__class__ and set(self.positions) >= set(
  154. other.positions
  155. )
  156. def intersects(self, other):
  157. """
  158. Return True if the positions of this Feature intersects with those of other
  159. More precisely, return True if this feature refers to the same property as other;
  160. and there is some overlap in the positions they look at.
  161. #For instance, importing a concrete subclass (Feature is abstract)
  162. >>> from nltk.tag.brill import Word, Pos
  163. >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
  164. True
  165. >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
  166. True
  167. >>> Word([-3,-2,-1]).intersects(Word([0]))
  168. False
  169. #Feature subclasses must agree
  170. >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
  171. False
  172. :param other: feature with which to compare
  173. :type other: (subclass of) Feature
  174. :return: True if feature classes agree and there is some overlap in the positions they look at
  175. :rtype: bool
  176. """
  177. return bool(
  178. (
  179. self.__class__ is other.__class__
  180. and set(self.positions) & set(other.positions)
  181. )
  182. )
  183. # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+),
  184. # it will be enough to define __lt__ and __eq__
  185. def __eq__(self, other):
  186. return self.__class__ is other.__class__ and self.positions == other.positions
  187. def __lt__(self, other):
  188. return (
  189. self.__class__.__name__ < other.__class__.__name__
  190. or
  191. # self.positions is a sorted tuple of ints
  192. self.positions < other.positions
  193. )
  194. def __ne__(self, other):
  195. return not (self == other)
  196. def __gt__(self, other):
  197. return other < self
  198. def __ge__(self, other):
  199. return not self < other
  200. def __le__(self, other):
  201. return self < other or self == other
  202. @staticmethod
  203. @abstractmethod
  204. def extract_property(tokens, index):
  205. """
  206. Any subclass of Feature must define static method extract_property(tokens, index)
  207. :param tokens: the sequence of tokens
  208. :type tokens: list of tokens
  209. :param index: the current index
  210. :type index: int
  211. :return: feature value
  212. :rtype: any (but usually scalar)
  213. """