
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Transformation-based learning
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Marcus Uneson <marcus.uneson@gmail.com>
#   based on previous (nltk2) version by
#   Christopher Maloof, Edward Loper, Steven Bird
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import print_function

from abc import ABCMeta, abstractmethod
from six import add_metaclass

from nltk.compat import python_2_unicode_compatible, unicode_repr
from nltk import jsontags

######################################################################
# Tag Rules
######################################################################


@add_metaclass(ABCMeta)
class TagRule(object):
    """
    An interface for tag transformations on a tagged corpus, as
    performed by tbl taggers.  Each transformation finds all tokens
    in the corpus that are tagged with a specific original tag and
    satisfy a specific condition, and replaces their tags with a
    replacement tag.  For any given transformation, the original
    tag, replacement tag, and condition are fixed.  Conditions may
    depend on the token under consideration, as well as any other
    tokens in the corpus.

    Tag rules must be comparable and hashable.
    """

    def __init__(self, original_tag, replacement_tag):

        self.original_tag = original_tag
        """The tag which this TagRule may cause to be replaced."""

        self.replacement_tag = replacement_tag
        """The tag with which this TagRule may replace another tag."""

    def apply(self, tokens, positions=None):
        """
        Apply this rule at every position in positions where it
        applies to the given sentence.  I.e., for each position p
        in *positions*, if *tokens[p]* is tagged with this rule's
        original tag, and satisfies this rule's condition, then set
        its tag to be this rule's replacement tag.

        :param tokens: The tagged sentence
        :type tokens: list(tuple(str, str))
        :param positions: The positions where the transformation is to
            be tried.  If not specified, try it at all positions.
        :type positions: list(int)
        :return: The indices of tokens whose tags were changed by this
            rule.
        :rtype: list(int)
        """
        if positions is None:
            positions = list(range(len(tokens)))

        # Determine the indices at which this rule applies.
        change = [i for i in positions if self.applies(tokens, i)]

        # Make the changes.  Note: this must be done in a separate
        # step from finding applicable locations, since we don't want
        # the rule to interact with itself.
        for i in change:
            tokens[i] = (tokens[i][0], self.replacement_tag)

        return change
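
    # Illustrative sketch (not part of the original docstring): with the
    # concrete Rule subclass defined below and the Pos feature from
    # nltk.tag.brill, a rule retagging VB as NN after a determiner behaves
    # roughly as follows:
    #
    #     >>> from nltk.tbl.rule import Rule
    #     >>> from nltk.tag.brill import Pos
    #     >>> r = Rule('23', 'VB', 'NN', [(Pos([-2, -1]), 'DT')])
    #     >>> sent = [('The', 'DT'), ('cat', 'VB'), ('sat', 'VBD')]
    #     >>> r.apply(sent)
    #     [1]
    #     >>> sent
    #     [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]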

    @abstractmethod
    def applies(self, tokens, index):
        """
        :return: True if the rule would change the tag of
            ``tokens[index]``, False otherwise
        :rtype: bool
        :param tokens: A tagged sentence
        :type tokens: list(tuple(str, str))
        :param index: The index to check
        :type index: int
        """

    # Rules must be comparable and hashable for the algorithm to work
    def __eq__(self, other):
        raise TypeError("Rules must implement __eq__()")

    def __ne__(self, other):
        raise TypeError("Rules must implement __ne__()")

    def __hash__(self):
        raise TypeError("Rules must implement __hash__()")


@python_2_unicode_compatible
@jsontags.register_tag
class Rule(TagRule):
    """
    A Rule checks the current corpus position for a certain set of conditions;
    if they are all fulfilled, the Rule is triggered, meaning that it
    will change tag A to tag B.  Tokens with tags other than A are left
    unchanged.

    The conditions are parameters to the Rule instance.  Each condition is a
    (feature, value) pair, with a set of positions to check for the value of
    the corresponding feature.  Conceptually, the positions are joined by
    logical OR, and the features by logical AND.

    More formally, the Rule is applicable to the n-th token iff:

      - the n-th token is tagged with the Rule's original tag; and

      - for each (Feature(positions), value) tuple:

        - the value of Feature of at least one token in
          {n+p for p in positions} is value.
    """

    json_tag = 'nltk.tbl.Rule'

    def __init__(self, templateid, original_tag, replacement_tag, conditions):
        """
        Construct a new Rule that changes a token's tag from
        *original_tag* to *replacement_tag* if all of the properties
        specified in *conditions* hold.

        :param templateid: the template id (a zero-padded string, '001' etc,
            so it will sort nicely)
        :type templateid: str
        :param conditions: A list of (Feature(positions), value) pairs,
            each of which specifies that the property (computed by
            Feature.extract_property()) of at least one
            token in {n+p for p in positions} is value.
        :type conditions: iterable of (Feature, value) pairs
        """
        TagRule.__init__(self, original_tag, replacement_tag)
        self._conditions = conditions
        self.templateid = templateid

    def encode_json_obj(self):
        return {
            'templateid': self.templateid,
            'original': self.original_tag,
            'replacement': self.replacement_tag,
            'conditions': self._conditions,
        }

    @classmethod
    def decode_json_obj(cls, obj):
        return cls(
            obj['templateid'], obj['original'], obj['replacement'], obj['conditions']
        )
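
    # Illustrative sketch (assuming Pos from nltk.tag.brill): the two methods
    # above round-trip a rule through the plain dict that the nltk.jsontags
    # machinery serializes.
    #
    #     >>> from nltk.tag.brill import Pos
    #     >>> r = Rule('23', 'VB', 'NN', [(Pos([-2, -1]), 'DT')])
    #     >>> Rule.decode_json_obj(r.encode_json_obj()) == r
    #     True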

    def applies(self, tokens, index):
        # Inherit docs from TagRule

        # Does the given token have this Rule's "original tag"?
        if tokens[index][1] != self.original_tag:
            return False

        # Check to make sure that every condition holds.
        for (feature, val) in self._conditions:

            # Look for *any* token that satisfies the condition.
            for pos in feature.positions:
                if not (0 <= index + pos < len(tokens)):
                    continue
                if feature.extract_property(tokens, index + pos) == val:
                    break
            else:
                # No token satisfied the condition; return false.
                return False

        # Every condition checked out, so the Rule is applicable.
        return True
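
    # Illustrative sketch (again assuming Pos from nltk.tag.brill): positions
    # falling outside the sentence are skipped, so a condition can only be
    # satisfied by tokens that actually exist.
    #
    #     >>> from nltk.tag.brill import Pos
    #     >>> r = Rule('23', 'VB', 'NN', [(Pos([-2, -1]), 'DT')])
    #     >>> r.applies([('run', 'VB'), ('fast', 'RB')], 0)
    #     False
    #     >>> r.applies([('The', 'DT'), ('cat', 'VB')], 1)
    #     True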

    def __eq__(self, other):
        return self is other or (
            other is not None
            and other.__class__ == self.__class__
            and self.original_tag == other.original_tag
            and self.replacement_tag == other.replacement_tag
            and self._conditions == other._conditions
        )

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):

        # Cache our hash value (justified by profiling.)
        try:
            return self.__hash
        except AttributeError:
            self.__hash = hash(repr(self))
            return self.__hash

    def __repr__(self):
        # Cache the repr (justified by profiling -- this is used as
        # a sort key when deterministic=True.)
        try:
            return self.__repr
        except AttributeError:
            self.__repr = "{0}('{1}', {2}, {3}, [{4}])".format(
                self.__class__.__name__,
                self.templateid,
                unicode_repr(self.original_tag),
                unicode_repr(self.replacement_tag),
                # list(self._conditions) would be simpler but will not generate
                # the same Rule.__repr__ in python 2 and 3 and thus break some tests
                ', '.join(
                    "({0},{1})".format(f, unicode_repr(v))
                    for (f, v) in self._conditions
                ),
            )
            return self.__repr

    def __str__(self):
        def _condition_to_logic(feature, value):
            """
            Return a compact, predicate-logic styled string representation
            of the given condition.
            """
            return '{0}:{1}@[{2}]'.format(
                feature.PROPERTY_NAME,
                value,
                ",".join(str(w) for w in feature.positions),
            )

        conditions = ' & '.join(
            [_condition_to_logic(f, v) for (f, v) in self._conditions]
        )
        s = '{0}->{1} if {2}'.format(
            self.original_tag, self.replacement_tag, conditions
        )

        return s

    def format(self, fmt):
        """
        Return a string representation of this rule.

        >>> from nltk.tbl.rule import Rule
        >>> from nltk.tag.brill import Pos

        >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')])

        >>> r.format("str") == str(r)
        True
        >>> r.format("str")
        'VB->NN if Pos:DT@[-2,-1]'

        >>> r.format("repr") == repr(r)
        True
        >>> r.format("repr")
        "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])"

        >>> r.format("verbose")
        'VB -> NN if the Pos of words i-2...i-1 is "DT"'

        >>> r.format("not_found")
        Traceback (most recent call last):
          File "<stdin>", line 1, in <module>
          File "nltk/tbl/rule.py", line 256, in format
            raise ValueError("unknown rule format spec: {0}".format(fmt))
        ValueError: unknown rule format spec: not_found
        >>>

        :param fmt: format specification
        :type fmt: str
        :return: string representation
        :rtype: str
        """
        if fmt == "str":
            return self.__str__()
        elif fmt == "repr":
            return self.__repr__()
        elif fmt == "verbose":
            return self._verbose_format()
        else:
            raise ValueError("unknown rule format spec: {0}".format(fmt))

    def _verbose_format(self):
        """
        Return a wordy, human-readable string representation
        of the given rule.

        Not sure how useful this is.
        """

        def condition_to_str(feature, value):
            return 'the %s of %s is "%s"' % (
                feature.PROPERTY_NAME,
                range_to_str(feature.positions),
                value,
            )

        def range_to_str(positions):
            if len(positions) == 1:
                p = positions[0]
                if p == 0:
                    return 'this word'
                if p == -1:
                    return 'the preceding word'
                elif p == 1:
                    return 'the following word'
                elif p < 0:
                    return 'word i-%d' % -p
                elif p > 0:
                    return 'word i+%d' % p
            else:
                # for complete compatibility with the wordy format of nltk2
                mx = max(positions)
                mn = min(positions)
                if mx - mn == len(positions) - 1:
                    return 'words i%+d...i%+d' % (mn, mx)
                else:
                    return 'words {%s}' % (",".join("i%+d" % d for d in positions),)

        replacement = '%s -> %s' % (self.original_tag, self.replacement_tag)
        conditions = (' if ' if self._conditions else "") + ', and '.join(
            condition_to_str(f, v) for (f, v) in self._conditions
        )

        return replacement + conditions
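
    # Illustrative sketch (assuming Pos from nltk.tag.brill): contiguous
    # position sets are rendered as a range (e.g. 'words i-2...i-1', as in the
    # doctest for format() above), non-contiguous ones as a set:
    #
    #     >>> from nltk.tag.brill import Pos
    #     >>> Rule('1', 'VB', 'NN', [(Pos([-3, -1]), 'DT')]).format("verbose")
    #     'VB -> NN if the Pos of words {i-3,i-1} is "DT"'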