123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323 |
- # -*- coding: utf-8 -*-
- # Natural Language Toolkit: Transformation-based learning
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Marcus Uneson <marcus.uneson@gmail.com>
- # based on previous (nltk2) version by
- # Christopher Maloof, Edward Loper, Steven Bird
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import print_function
- from abc import ABCMeta, abstractmethod
- from six import add_metaclass
- import itertools as it
- from nltk.tbl.feature import Feature
- from nltk.tbl.rule import Rule
- @add_metaclass(ABCMeta)
- class BrillTemplateI(object):
- """
- An interface for generating lists of transformational rules that
- apply at given sentence positions. ``BrillTemplateI`` is used by
- ``Brill`` training algorithms to generate candidate rules.
- """
- @abstractmethod
- def applicable_rules(self, tokens, i, correctTag):
- """
- Return a list of the transformational rules that would correct
- the *i*th subtoken's tag in the given token. In particular,
- return a list of zero or more rules that would change
- *tokens*[i][1] to *correctTag*, if applied to *token*[i].
- If the *i*th token already has the correct tag (i.e., if
- tagged_tokens[i][1] == correctTag), then
- ``applicable_rules()`` should return the empty list.
- :param tokens: The tagged tokens being tagged.
- :type tokens: list(tuple)
- :param i: The index of the token whose tag should be corrected.
- :type i: int
- :param correctTag: The correct tag for the *i*th token.
- :type correctTag: any
- :rtype: list(BrillRule)
- """
- @abstractmethod
- def get_neighborhood(self, token, index):
- """
- Returns the set of indices *i* such that
- ``applicable_rules(token, i, ...)`` depends on the value of
- the *index*th token of *token*.
- This method is used by the "fast" Brill tagger trainer.
- :param token: The tokens being tagged.
- :type token: list(tuple)
- :param index: The index whose neighborhood should be returned.
- :type index: int
- :rtype: set
- """
- class Template(BrillTemplateI):
- """
- A tbl Template that generates a list of L{Rule}s that apply at a given sentence
- position. In particular, each C{Template} is parameterized by a list of
- independent features (a combination of a specific
- property to extract and a list C{L} of relative positions at which to extract
- it) and generates all Rules that:
- - use the given features, each at its own independent position; and
- - are applicable to the given token.
- """
- ALLTEMPLATES = []
- # record a unique id of form "001", for each template created
- # _ids = it.count(0)
- def __init__(self, *features):
- """
- Construct a Template for generating Rules.
- Takes a list of Features. A C{Feature} is a combination
- of a specific property and its relative positions and should be
- a subclass of L{nltk.tbl.feature.Feature}.
- An alternative calling convention (kept for backwards compatibility,
- but less expressive as it only permits one feature type) is
- Template(Feature, (start1, end1), (start2, end2), ...)
- In new code, that would be better written
- Template(Feature(start1, end1), Feature(start2, end2), ...)
- #For instance, importing some features
- >>> from nltk.tbl.template import Template
- >>> from nltk.tag.brill import Word, Pos
- #create some features
- >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1]))
- #Create a single-feature template
- >>> Template(wfeat1)
- Template(Word([-1]))
- #or a two-feature one
- >>> Template(wfeat1, wfeat2)
- Template(Word([-1]),Word([1, 2]))
- #or a three-feature one with two different feature types
- >>> Template(wfeat1, wfeat2, pfeat)
- Template(Word([-1]),Word([1, 2]),Pos([-2, -1]))
- #deprecated api: Feature subclass, followed by list of (start,end) pairs
- #(permits only a single Feature)
- >>> Template(Word, (-2,-1), (0,0))
- Template(Word([-2, -1]),Word([0]))
- #incorrect specification raises TypeError
- >>> Template(Word, (-2,-1), Pos, (0,0))
- Traceback (most recent call last):
- File "<stdin>", line 1, in <module>
- File "nltk/tag/tbl/template.py", line 143, in __init__
- raise TypeError(
- TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ...
- :type features: list of Features
- :param features: the features to build this Template on
- """
- # determine the calling form: either
- # Template(Feature, args1, [args2, ...)]
- # Template(Feature1(args), Feature2(args), ...)
- if all(isinstance(f, Feature) for f in features):
- self._features = features
- elif issubclass(features[0], Feature) and all(
- isinstance(a, tuple) for a in features[1:]
- ):
- self._features = [features[0](*tp) for tp in features[1:]]
- else:
- raise TypeError(
- "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..."
- )
- self.id = "{0:03d}".format(len(self.ALLTEMPLATES))
- self.ALLTEMPLATES.append(self)
- def __repr__(self):
- return "%s(%s)" % (
- self.__class__.__name__,
- ",".join([str(f) for f in self._features]),
- )
- def applicable_rules(self, tokens, index, correct_tag):
- if tokens[index][1] == correct_tag:
- return []
- # For each of this Template's features, find the conditions
- # that are applicable for the given token.
- # Then, generate one Rule for each combination of features
- # (the crossproduct of the conditions).
- applicable_conditions = self._applicable_conditions(tokens, index)
- xs = list(it.product(*applicable_conditions))
- return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs]
- def _applicable_conditions(self, tokens, index):
- """
- :returns: A set of all conditions for rules
- that are applicable to C{tokens[index]}.
- """
- conditions = []
- for feature in self._features:
- conditions.append([])
- for pos in feature.positions:
- if not (0 <= index + pos < len(tokens)):
- continue
- value = feature.extract_property(tokens, index + pos)
- conditions[-1].append((feature, value))
- return conditions
- def get_neighborhood(self, tokens, index):
- # inherit docs from BrillTemplateI
- # applicable_rules(tokens, index, ...) depends on index.
- neighborhood = set([index]) # set literal for python 2.7+
- # applicable_rules(tokens, i, ...) depends on index if
- # i+start < index <= i+end.
- allpositions = [0] + [p for feat in self._features for p in feat.positions]
- start, end = min(allpositions), max(allpositions)
- s = max(0, index + (-end))
- e = min(index + (-start) + 1, len(tokens))
- for i in range(s, e):
- neighborhood.add(i)
- return neighborhood
- @classmethod
- def expand(cls, featurelists, combinations=None, skipintersecting=True):
- """
- Factory method to mass generate Templates from a list L of lists of Features.
- #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2
- #of the sublists in L; it will output all Templates formed by the Cartesian product
- #of this selection, with duplicates and other semantically equivalent
- #forms removed. Default for combinations is (1, len(L)).
- The feature lists may have been specified
- manually, or generated from Feature.expand(). For instance,
- >>> from nltk.tbl.template import Template
- >>> from nltk.tag.brill import Word, Pos
- #creating some features
- >>> (wd_0, wd_01) = (Word([0]), Word([0,1]))
- >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([3-2,-1,0,1,2,3]))
- >>> list(Template.expand([[wd_0], [pos_m2]]))
- [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))]
- >>> list(Template.expand([[wd_0, wd_01], [pos_m2]]))
- [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))]
- #note: with Feature.expand(), it is very easy to generate more templates
- #than your system can handle -- for instance,
- >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False)
- >>> len(wordtpls)
- 7
- >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True)
- >>> len(postpls)
- 9
- #and now the Cartesian product of all non-empty combinations of two wordtpls and
- #two postpls, with semantic equivalents removed
- >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls]))
- >>> len(templates)
- 713
- will return a list of eight templates
- Template(Word([0])),
- Template(Word([0, 1])),
- Template(Pos([-2])),
- Template(Pos([-1])),
- Template(Pos([-2]),Word([0])),
- Template(Pos([-1]),Word([0])),
- Template(Pos([-2]),Word([0, 1])),
- Template(Pos([-1]),Word([0, 1]))]
- #Templates where one feature is a subset of another, such as
- #Template(Word([0,1]), Word([1]), will not appear in the output.
- #By default, this non-subset constraint is tightened to disjointness:
- #Templates of type Template(Word([0,1]), Word([1,2]) will also be filtered out.
- #With skipintersecting=False, then such Templates are allowed
- WARNING: this method makes it very easy to fill all your memory when training
- generated templates on any real-world corpus
- :param featurelists: lists of Features, whose Cartesian product will return a set of Templates
- :type featurelists: list of (list of Features)
- :param combinations: given n featurelists: if combinations=k, all generated Templates will have
- k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n
- :type combinations: None, int, or (int, int)
- :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature)
- :type skipintersecting: bool
- :returns: generator of Templates
- """
- def nonempty_powerset(xs): # xs is a list
- # itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
- # find the correct tuple given combinations, one of {None, k, (k1,k2)}
- k = combinations # for brevity
- combrange = (
- (1, len(xs) + 1)
- if k is None
- else (k, k + 1) # n over 1 .. n over n (all non-empty combinations)
- if isinstance(k, int)
- else (k[0], k[1] + 1) # n over k (only
- ) # n over k1, n over k1+1... n over k2
- return it.chain.from_iterable(
- it.combinations(xs, r) for r in range(*combrange)
- )
- seentemplates = set()
- for picks in nonempty_powerset(featurelists):
- for pick in it.product(*picks):
- if any(
- i != j and x.issuperset(y)
- for (i, x) in enumerate(pick)
- for (j, y) in enumerate(pick)
- ):
- continue
- if skipintersecting and any(
- i != j and x.intersects(y)
- for (i, x) in enumerate(pick)
- for (j, y) in enumerate(pick)
- ):
- continue
- thistemplate = cls(*sorted(pick))
- strpick = str(thistemplate)
- #!!FIXME --this is hackish
- if strpick in seentemplates: # already added
- cls._poptemplate()
- continue
- seentemplates.add(strpick)
- yield thistemplate
- @classmethod
- def _cleartemplates(cls):
- cls.ALLTEMPLATES = []
- @classmethod
- def _poptemplate(cls):
- return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None
|