123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Transformation-based learning
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Marcus Uneson <marcus.uneson@gmail.com>
- # based on previous (nltk2) version by
- # Christopher Maloof, Edward Loper, Steven Bird
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import division, print_function, unicode_literals
- from abc import ABCMeta, abstractmethod
- from six import add_metaclass
@add_metaclass(ABCMeta)
class Feature(object):
    """
    An abstract base class for Features. A Feature is a combination of
    a specific property-computing method and a list of relative positions
    to apply that method to.

    The property-computing method, ``extract_property(tokens, index)``,
    must be implemented by every subclass. It extracts or computes a specific
    property for the token at the current index. Typical extract_property()
    methods return features such as the token text or tag; but more involved
    methods may consider the entire sequence ``tokens`` and
    for instance compute the length of the sentence the token belongs to.

    In addition, the subclass may have a PROPERTY_NAME, which is how
    it will be printed (in Rules and Templates, etc). If not given, defaults
    to the classname.

    Features compare equal when they are of exactly the same class and look
    at the same positions; they are ordered by class name first and by
    positions second, and they hash consistently with equality.
    """

    json_tag = 'nltk.tbl.Feature'
    PROPERTY_NAME = None

    def __init__(self, positions, end=None):
        """
        Construct a Feature which may apply at ``positions``.

        For instance, importing some concrete subclasses (Feature is abstract)

        >>> from nltk.tag.brill import Word, Pos

        Feature Word, applying at one of [-2, -1]

        >>> Word([-2,-1])
        Word([-2, -1])

        Positions need not be contiguous

        >>> Word([-2,-1, 1])
        Word([-2, -1, 1])

        Contiguous ranges can alternatively be specified giving the
        two endpoints (inclusive)

        >>> Pos(-3, -1)
        Pos([-3, -2, -1])

        In two-arg form, start <= end is enforced

        >>> Pos(2, 1)
        Traceback (most recent call last):
          ...
        ValueError: illegal interval specification: (start=2, end=1)

        :type positions: list of int
        :param positions: the positions at which this features should apply
        :raises ValueError: illegal position specifications

        An alternative calling convention, for contiguous positions only,
        is Feature(start, end):

        :type start: int
        :param start: start of range where this feature should apply
        :type end: int
        :param end: end of range (NOTE: inclusive!) where this feature should apply
        """
        self.positions = None  # to avoid warnings
        if end is None:
            # One-argument form: deduplicate and sort, so that equal position
            # sets always yield the same (hashable) tuple.
            self.positions = tuple(sorted(set(int(i) for i in positions)))
        else:
            # Two-argument form: ``positions`` is not a list but the start
            # index of an inclusive range ending at ``end``.
            try:
                if positions > end:
                    # Funnel an inverted interval into the handler below.
                    raise TypeError
                self.positions = tuple(range(positions, end + 1))
            except TypeError:
                # let any kind of erroneous spec raise ValueError
                raise ValueError(
                    "illegal interval specification: (start={0}, end={1})".format(
                        positions, end
                    )
                )

        # set property name given in subclass, or otherwise name of subclass
        self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__

    def encode_json_obj(self):
        """Return the JSON-serializable state of this Feature: its positions."""
        return self.positions

    @classmethod
    def decode_json_obj(cls, obj):
        """Rebuild a Feature of this class from the positions in ``obj``."""
        positions = obj
        return cls(positions)

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, list(self.positions))

    @classmethod
    def expand(cls, starts, winlens, excludezero=False):
        """
        Return a list of features, one for each start point in starts
        and for each window length in winlen. If excludezero is True,
        no Features containing 0 in its positions will be generated
        (many tbl trainers have a special representation for the
        target feature at [0])

        For instance, importing a concrete subclass (Feature is abstract)

        >>> from nltk.tag.brill import Word

        First argument gives the possible start positions, second the
        possible window lengths

        >>> Word.expand([-3,-2,-1], [1])
        [Word([-3]), Word([-2]), Word([-1])]

        >>> Word.expand([-2,-1], [1])
        [Word([-2]), Word([-1])]

        >>> Word.expand([-3,-2,-1], [1,2])
        [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])]

        A third optional argument excludes all Features whose positions
        contain zero

        >>> Word.expand([-2,-1,0], [1,2], excludezero=False)
        [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])]

        >>> Word.expand([-2,-1,0], [1,2], excludezero=True)
        [Word([-2]), Word([-1]), Word([-2, -1])]

        All window lengths must be positive

        >>> Word.expand([-2,-1], [0])
        Traceback (most recent call last):
          ...
        ValueError: non-positive window length in [0]

        :param starts: where to start looking for Feature
        :type starts: list of ints
        :param winlens: window lengths where to look for Feature
        :type winlens: list of ints
        :param excludezero: do not output any Feature with 0 in any of its positions.
        :type excludezero: bool
        :returns: list of Features
        :raises ValueError: for non-positive window lengths
        """
        if not all(x > 0 for x in winlens):
            raise ValueError("non-positive window length in {0}".format(winlens))
        # Slide a window of each requested length over ``starts``; a window
        # longer than ``starts`` yields an empty range and hence no Feature.
        xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1))
        return [cls(x) for x in xs if not (excludezero and 0 in x)]

    def issuperset(self, other):
        """
        Return True if this Feature always returns True when other does

        More precisely, return True if this feature refers to the same property as other;
        and this Feature looks at all positions that other does (and possibly
        other positions in addition).

        For instance, importing a concrete subclass (Feature is abstract)

        >>> from nltk.tag.brill import Word, Pos
        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2]))
        True
        >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0]))
        False

        Feature subclasses must agree

        >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if this feature is superset, otherwise False
        :rtype: bool
        """
        return self.__class__ is other.__class__ and set(self.positions) >= set(
            other.positions
        )

    def intersects(self, other):
        """
        Return True if the positions of this Feature intersects with those of other

        More precisely, return True if this feature refers to the same property as other;
        and there is some overlap in the positions they look at.

        For instance, importing a concrete subclass (Feature is abstract)

        >>> from nltk.tag.brill import Word, Pos
        >>> Word([-3,-2,-1]).intersects(Word([-3,-2]))
        True
        >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0]))
        True
        >>> Word([-3,-2,-1]).intersects(Word([0]))
        False

        Feature subclasses must agree

        >>> Word([-3,-2,-1]).intersects(Pos([-3,-2]))
        False

        :param other: feature with which to compare
        :type other: (subclass of) Feature
        :return: True if feature classes agree and there is some overlap in the positions they look at
        :rtype: bool
        """
        return bool(
            (
                self.__class__ is other.__class__
                and set(self.positions) & set(other.positions)
            )
        )

    # Rich comparisons for Features. All six are spelled out (rather than
    # relying on functools.total_ordering) to keep Python 2 compatibility.
    def __eq__(self, other):
        return self.__class__ is other.__class__ and self.positions == other.positions

    def __hash__(self):
        # Must agree with __eq__: equal Features (same class, same positions)
        # hash alike. Defining __eq__ alone would leave the class unhashable
        # on Python 3 and identity-hashed on Python 2.
        return hash((self.__class__, self.positions))

    def __lt__(self, other):
        # Lexicographic order: class name first, positions as tie-breaker.
        # (self.positions is a sorted tuple of ints.) Comparing the pair as a
        # tuple keeps the order antisymmetric across different subclasses,
        # which a plain "name < name or positions < positions" test is not.
        return (self.__class__.__name__, self.positions) < (
            other.__class__.__name__,
            other.positions,
        )

    def __ne__(self, other):
        return not (self == other)

    def __gt__(self, other):
        return other < self

    def __ge__(self, other):
        return not self < other

    def __le__(self, other):
        return self < other or self == other

    @staticmethod
    @abstractmethod
    def extract_property(tokens, index):
        """
        Any subclass of Feature must define static method extract_property(tokens, index)

        :param tokens: the sequence of tokens
        :type tokens: list of tokens
        :param index: the current index
        :type index: int
        :return: feature value
        :rtype: any (but usually scalar)
        """
|