12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489 |
- # Natural Language Toolkit: Regular Expression Chunkers
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # Steven Bird <stevenbird1@gmail.com> (minor additions)
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import print_function, unicode_literals
- from __future__ import division
- import re
- from six import string_types
- from nltk.tree import Tree
- from nltk.chunk.api import ChunkParserI
- from nltk.compat import python_2_unicode_compatible, unicode_repr
- ##//////////////////////////////////////////////////////
- ## ChunkString
- ##//////////////////////////////////////////////////////
@python_2_unicode_compatible
class ChunkString(object):
    """
    A string-based encoding of a particular chunking of a text.
    Internally, the ``ChunkString`` class uses a single string to
    encode the chunking of the input text.  This string contains a
    sequence of angle-bracket delimited tags, with chunking indicated
    by braces.  An example of this encoding is::

        {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

    ``ChunkString`` are created from tagged texts (i.e., lists of
    ``tokens`` whose type is ``TaggedType``).  Initially, nothing is
    chunked.

    The chunking of a ``ChunkString`` can be modified with the ``xform()``
    method, which uses a regular expression to transform the string
    representation.  These transformations should only add and remove
    braces; they should *not* modify the sequence of angle-bracket
    delimited tags.

    :type _str: str
    :ivar _str: The internal string representation of the text's
        encoding.  This string representation contains a sequence of
        angle-bracket delimited tags, with chunking indicated by
        braces.  An example of this encoding is::

            {<DT><JJ><NN>}<VBN><IN>{<DT><NN>}<.>{<DT><NN>}<VBD><.>

        An empty chunk structure is encoded as the empty string.

    :type _pieces: list(tagged tokens and chunks)
    :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``.
    :ivar _debug: The debug level.  See the constructor docs.

    :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chunks.
    :cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that
        will only match positions that are in chinks.
    """

    CHUNK_TAG_CHAR = r'[^\{\}<>]'
    CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR

    IN_CHUNK_PATTERN = r'(?=[^\{]*\})'
    IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))'

    # These are used by _verify
    _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG
    _CHINK = r'(%s+?)+?' % CHUNK_TAG
    _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG)
    # Raw strings: '\{' and '\}' are not valid Python string escapes.
    _BRACKETS = re.compile(r'[^\{\}]+')
    _BALANCED_BRACKETS = re.compile(r'(\{\})*$')

    def __init__(self, chunk_struct, debug_level=1):
        """
        Construct a new ``ChunkString`` that encodes the chunking of
        the text ``tagged_tokens``.

        :type chunk_struct: Tree
        :param chunk_struct: The chunk structure to be further chunked.
        :type debug_level: int
        :param debug_level: The level of debugging which should be
            applied to transformations on the ``ChunkString``.  The
            valid levels are:

            - 0: no checks
            - 1: full check on to_chunkstruct
            - 2: full check on to_chunkstruct and cursory check after
              each transformation.
            - 3: full check on to_chunkstruct and full check after
              each transformation.

            We recommend you use at least level 1.  You should
            probably use level 3 if you use any non-standard
            subclasses of ``RegexpChunkRule``.
        :raise ValueError: if ``chunk_struct`` contains an element that
            is neither a tuple nor a ``Tree``.
        """
        self._root_label = chunk_struct.label()
        self._pieces = chunk_struct[:]
        tags = [self._tag(tok) for tok in self._pieces]
        # An empty structure must be encoded as '' rather than '<>':
        # '<>' is an empty tag, which _VALID rejects, so verification
        # of an untouched empty ChunkString would otherwise fail.
        self._str = '<' + '><'.join(tags) + '>' if tags else ''
        self._debug = debug_level

    def _tag(self, tok):
        """
        Return the tag used to encode ``tok``: the second element of a
        tuple, or the label of a ``Tree``.

        :raise ValueError: if ``tok`` is neither a tuple nor a ``Tree``.
        """
        if isinstance(tok, tuple):
            return tok[1]
        elif isinstance(tok, Tree):
            return tok.label()
        else:
            raise ValueError('chunk structures must contain tagged tokens or trees')

    def _verify(self, s, verify_tags):
        """
        Check to make sure that ``s`` still corresponds to some chunked
        version of ``_pieces``.

        :type verify_tags: bool
        :param verify_tags: Whether the individual tags should be
            checked.  If this is false, ``_verify`` will check to make
            sure that ``_str`` encodes a chunked version of *some*
            list of tokens.  If this is true, then ``_verify`` will
            check to make sure that the tags in ``_str`` match those in
            ``_pieces``.

        :raise ValueError: if the internal string representation of
            this ``ChunkString`` is invalid or not consistent with _pieces.
        """
        # Check overall form
        if not ChunkString._VALID.match(s):
            raise ValueError('Transformation generated invalid chunkstring:\n %s' % s)

        # Check that parens are balanced.  If the string is long, we
        # have to do this in pieces, to avoid a maximum recursion
        # depth limit for regular expressions.  The piece size is even,
        # so a well-formed run of "{}" pairs is never cut mid-pair.
        brackets = ChunkString._BRACKETS.sub('', s)
        for i in range(1 + len(brackets) // 5000):
            substr = brackets[i * 5000 : i * 5000 + 5000]
            if not ChunkString._BALANCED_BRACKETS.match(substr):
                raise ValueError(
                    'Transformation generated invalid chunkstring:\n %s' % s
                )

        if verify_tags <= 0:
            return

        # Splitting on runs of delimiters leaves an empty string at each
        # end (or a single empty string for ''), hence the [1:-1].
        tags1 = (re.split(r'[\{\}<>]+', s))[1:-1]
        tags2 = [self._tag(piece) for piece in self._pieces]
        if tags1 != tags2:
            raise ValueError('Transformation generated invalid chunkstring: tag changed')

    def to_chunkstruct(self, chunk_label='CHUNK'):
        """
        Return the chunk structure encoded by this ``ChunkString``.

        :type chunk_label: str
        :param chunk_label: The label given to each ``Tree`` created
            for a chunk.
        :rtype: Tree
        :raise ValueError: If a transformation has generated an
            invalid chunkstring.
        """
        if self._debug > 0:
            self._verify(self._str, 1)

        # Splitting on braces yields an alternating list that starts
        # outside a chunk: chink, chunk, chink, chunk, ...
        pieces = []
        index = 0
        piece_in_chunk = False
        for piece in re.split('[{}]', self._str):
            # Find the list of tokens contained in this piece.
            length = piece.count('<')
            subsequence = self._pieces[index : index + length]

            # Add this list of tokens to our pieces.
            if piece_in_chunk:
                pieces.append(Tree(chunk_label, subsequence))
            else:
                pieces += subsequence

            # Update index, piece_in_chunk
            index += length
            piece_in_chunk = not piece_in_chunk

        return Tree(self._root_label, pieces)

    def xform(self, regexp, repl):
        """
        Apply the given transformation to the string encoding of this
        ``ChunkString``.  In particular, find all occurrences that match
        ``regexp``, and replace them using ``repl`` (as done by
        ``re.sub``).

        This transformation should only add and remove braces; it
        should *not* modify the sequence of angle-bracket delimited
        tags.  Furthermore, this transformation may not result in
        improper bracketing.  Note, in particular, that bracketing may
        not be nested.

        :type regexp: str or regexp
        :param regexp: A regular expression matching the substring
            that should be replaced.  This will typically include a
            named group, which can be used by ``repl``.
        :type repl: str
        :param repl: An expression specifying what should replace the
            matched substring.  Typically, this will include a named
            replacement group, specified by ``regexp``.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        # Do the actual substitution
        s = re.sub(regexp, repl, self._str)

        # The substitution might have generated "empty chunks"
        # (substrings of the form "{}").  Remove them, so they don't
        # interfere with other transformations.
        s = re.sub(r'\{\}', '', s)

        # Make sure that the transformation was legal.  Level 2 gives a
        # cursory check (verify_tags == 0), level 3 a full one.
        if self._debug > 1:
            self._verify(s, self._debug - 2)

        # Commit the transformation.
        self._str = s

    def __repr__(self):
        """
        Return a string representation of this ``ChunkString``.
        It has the form::

            <ChunkString: '{<DT><JJ><NN>}<VBN><IN>{<DT><NN>}'>

        :rtype: str
        """
        return '<ChunkString: %s>' % unicode_repr(self._str)

    def __str__(self):
        """
        Return a formatted representation of this ``ChunkString``.
        This representation will include extra spaces to ensure that
        tags will line up with the representation of other
        ``ChunkStrings`` for the same text, regardless of the chunking.

        :rtype: str
        """
        # Add spaces to make everything line up.
        spaced = re.sub(r'>(?!\})', r'> ', self._str)
        spaced = re.sub(r'([^\{])<', r'\1 <', spaced)
        # startswith() is safe on the empty encoding, unlike indexing.
        if spaced.startswith('<'):
            spaced = ' ' + spaced
        return spaced
- ##//////////////////////////////////////////////////////
- ## Chunking Rules
- ##//////////////////////////////////////////////////////
@python_2_unicode_compatible
class RegexpChunkRule(object):
    """
    A transformational rule that rewrites the chunking of a
    ``ChunkString`` by means of a regular expression and a replacement
    expression.  Applying the rule searches the ``ChunkString`` for
    every substring matching the regular expression and replaces it
    using the replacement expression, with the same semantics as
    ``re.sub``.

    ``RegexpChunkRule`` can be used directly for arbitrary
    regexp-based transformations; its subclasses implement simpler
    kinds of rules that are based on matching tag patterns.

    Every rule also carries a description string: a short (typically
    less than 75 characters) summary of the rule's purpose.

    The transformation defined by a rule should only add and remove
    braces; it should *not* modify the sequence of angle-bracket
    delimited tags, and it may not produce nested or mismatched
    bracketing.
    """

    def __init__(self, regexp, repl, descr):
        """
        Construct a new RegexpChunkRule.

        :type regexp: regexp or str
        :param regexp: The regular expression for this rule.  Any
            substring of a ``ChunkString`` that matches it is replaced
            using ``repl`` when the rule is applied.  This must be a
            normal regular expression, not a tag pattern.  A string is
            compiled; anything else is stored unchanged.
        :type repl: str
        :param repl: The replacement expression used for every match
            of ``regexp``.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        compiled = re.compile(regexp) if isinstance(regexp, string_types) else regexp
        self._regexp = compiled
        self._repl = repl
        self._descr = descr

    def apply(self, chunkstr):
        # Keep docstring generic so we can inherit it.
        """
        Apply this rule to the given ``ChunkString``.  See the
        class reference documentation for a description of what it
        means to apply a rule.

        :type chunkstr: ChunkString
        :param chunkstr: The chunkstring to which this rule is applied.
        :rtype: None
        :raise ValueError: If this transformation generated an
            invalid chunkstring.
        """
        chunkstr.xform(self._regexp, self._repl)

    def descr(self):
        """
        Return a short description of the purpose and/or effect of
        this rule.

        :rtype: str
        """
        return self._descr

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <RegexpChunkRule: '{<IN|VB.*>}'->'<IN>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<RegexpChunkRule: %s->%s>' % (
            unicode_repr(self._regexp.pattern),
            unicode_repr(self._repl),
        )

    @staticmethod
    def fromstring(s):
        """
        Create a RegexpChunkRule from a string description.
        Currently, the following formats are supported::

            {regexp}               # chunk rule
            }regexp{               # chink rule
            regexp}{regexp         # split rule
            regexp{}regexp         # merge rule
            regexp{regexp}regexp   # chunk rule with context

        Where ``regexp`` is a regular expression for the rule.  Any
        text following the comment marker (``#``) will be used as
        the rule's description:

        >>> from nltk.chunk.regexp import RegexpChunkRule
        >>> RegexpChunkRule.fromstring('{<DT>?<NN.*>+}')
        <ChunkRule: '<DT>?<NN.*>+'>

        :raise ValueError: if the rule text is empty, matches none of
            the formats above, or cannot be turned into a rule.
        """
        # Split off the comment (but don't split on '\#')
        parsed = re.match(r'(?P<rule>(\\.|[^#])*)(?P<comment>#.*)?', s)
        rule = parsed.group('rule').strip()
        comment_text = parsed.group('comment')
        # Drop the leading '#' from the comment, if there is one.
        comment = comment_text[1:].strip() if comment_text else ''

        # Pattern bodies: chunk, chink, split, merge
        try:
            if not rule:
                raise ValueError('Empty chunk pattern')
            first, last = rule[0], rule[-1]
            if first == '{' and last == '}':
                return ChunkRule(rule[1:-1], comment)
            if first == '}' and last == '{':
                return ChinkRule(rule[1:-1], comment)
            if '}{' in rule:
                # Unpacking fails (ValueError) on more than one '}{'.
                left, right = rule.split('}{')
                return SplitRule(left, right, comment)
            if '{}' in rule:
                left, right = rule.split('{}')
                return MergeRule(left, right, comment)
            if re.match('[^{}]*{[^{}]*}[^{}]*', rule):
                left, chunk, right = re.split('[{}]', rule)
                return ChunkRuleWithContext(left, chunk, right, comment)
            raise ValueError('Illegal chunk pattern: %s' % rule)
        except (ValueError, re.error):
            # Every failure is reported uniformly as an illegal pattern.
            raise ValueError('Illegal chunk pattern: %s' % rule)
@python_2_unicode_compatible
class ChunkRule(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using a
    matching tag pattern.  When applied to a ``ChunkString``, it will
    find any substring that matches this tag pattern and that is not
    already part of a chunk, and create a new chunk containing that
    substring.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            chunk any substring that matches this tag pattern and that
            is not already part of a chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # The trailing lookahead restricts matches to chink positions.
        regexp = re.compile(
            '(?P<chunk>%s)%s'
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)
        )
        # Raw string: '\g' is not a valid Python string escape.
        RegexpChunkRule.__init__(self, regexp, r'{\g<chunk>}', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<ChunkRule: ' + unicode_repr(self._pattern) + '>'
@python_2_unicode_compatible
class ChinkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chinks from a ``ChunkString``,
    using a matching tag pattern.  When applied to a
    ``ChunkString``, it will find any substring that matches this
    tag pattern and that is contained in a chunk, and remove it
    from that chunk, thus creating two new chunks.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``ChinkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            find any substring that matches this tag pattern and that
            is contained in a chunk, and remove it from that chunk,
            thus creating two new chunks.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # The trailing lookahead restricts matches to chunk positions.
        regexp = re.compile(
            '(?P<chink>%s)%s'
            % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)
        )
        # Raw string: '\g' is not a valid Python string escape.
        RegexpChunkRule.__init__(self, regexp, r'}\g<chink>{', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChinkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<ChinkRule: ' + unicode_repr(self._pattern) + '>'
@python_2_unicode_compatible
class UnChunkRule(RegexpChunkRule):
    """
    A rule specifying how to remove chunks from a ``ChunkString``,
    using a matching tag pattern.  When applied to a
    ``ChunkString``, it will find any complete chunk that matches this
    tag pattern, and un-chunk it.
    """

    def __init__(self, tag_pattern, descr):
        """
        Construct a new ``UnChunkRule``.

        :type tag_pattern: str
        :param tag_pattern: This rule's tag pattern.  When
            applied to a ``ChunkString``, this rule will
            find any complete chunk that matches this tag pattern,
            and un-chunk it.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        self._pattern = tag_pattern
        # Raw strings: '\{', '\}' and '\g' are not valid Python string
        # escapes.  The braces anchor the match to a complete chunk.
        regexp = re.compile(r'\{(?P<chunk>%s)\}' % tag_pattern2re_pattern(tag_pattern))
        RegexpChunkRule.__init__(self, regexp, r'\g<chunk>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <UnChunkRule: '<IN|VB.*>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return '<UnChunkRule: ' + unicode_repr(self._pattern) + '>'
@python_2_unicode_compatible
class MergeRule(RegexpChunkRule):
    """
    A rule specifying how to merge chunks in a ``ChunkString``, using
    two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose end
    matches left pattern, and immediately followed by a chunk whose
    beginning matches right pattern.  It will then merge those two
    chunks into a single chunk.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``MergeRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chunk
            whose beginning matches this pattern.  It will
            then merge those two chunks into a single chunk.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chunk
            whose beginning matches ``right_tag_pattern``.  It will
            then merge those two chunks into a single chunk.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # The right pattern is only a lookahead; the match consumes the
        # left pattern and the '}{' boundary, which the replacement drops.
        regexp = re.compile(
            '(?P<left>%s)}{(?=%s)'
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        # Raw string: '\g' is not a valid Python string escape.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <MergeRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<MergeRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
@python_2_unicode_compatible
class SplitRule(RegexpChunkRule):
    """
    A rule that splits chunks in a ``ChunkString``, driven by two tag
    patterns: a left pattern and a right pattern.  When applied to a
    ``ChunkString``, it finds any chunk that matches the left pattern
    followed by the right pattern, and splits that chunk into two new
    chunks at the point between the two pattern matches.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``SplitRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag pattern.  The
            rule finds any chunk containing a substring that matches
            ``left_tag_pattern`` followed by this pattern, and splits
            the chunk in two at the point between the two matches.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag pattern.  The
            rule finds any chunk containing a substring that matches
            this pattern followed by ``right_tag_pattern``, and splits
            the chunk in two at the point between the two matches.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Compile each translated pattern on its own first, so that an
        # incoherent pair (e.g. left='(' and right=')') raises here
        # instead of combining into a regexp that happens to be valid.
        left_re = tag_pattern2re_pattern(left_tag_pattern)
        re.compile(left_re)
        right_re = tag_pattern2re_pattern(right_tag_pattern)
        re.compile(right_re)

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern

        # Only the left pattern is consumed; the right one is a lookahead.
        split_point = re.compile('(?P<left>%s)(?=%s)' % (left_re, right_re))
        RegexpChunkRule.__init__(self, split_point, r'\g<left>}{', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <SplitRule: '<NN>', '<DT>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        left = unicode_repr(self._left_tag_pattern)
        right = unicode_repr(self._right_tag_pattern)
        return '<SplitRule: ' + left + ', ' + right + '>'
@python_2_unicode_compatible
class ExpandLeftRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the left,
    using two matching tag patterns: a left pattern, and a right pattern.
    When applied to a ``ChunkString``, it will find any chunk whose beginning
    matches right pattern, and immediately preceded by a chink whose
    end matches left pattern.  It will then expand the chunk to incorporate
    the new material on the left.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandLeftRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            this pattern, and immediately preceded by a chink
            whose end matches ``left_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the left.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose beginning matches
            ``right_tag_pattern``, and immediately preceded by a chink
            whose end matches this pattern.  It will
            then expand the chunk to incorporate the new material on the left.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings: '\{' and '\g' are not valid Python string escapes.
        regexp = re.compile(
            r'(?P<left>%s)\{(?P<right>%s)'
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        # Move the opening brace from between the two matches to before
        # the left one.
        RegexpChunkRule.__init__(self, regexp, r'{\g<left>\g<right>', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandLeftRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<ExpandLeftRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
@python_2_unicode_compatible
class ExpandRightRule(RegexpChunkRule):
    """
    A rule specifying how to expand chunks in a ``ChunkString`` to the
    right, using two matching tag patterns: a left pattern, and a
    right pattern.  When applied to a ``ChunkString``, it will find any
    chunk whose end matches left pattern, and immediately followed by
    a chink whose beginning matches right pattern.  It will then
    expand the chunk to incorporate the new material on the right.
    """

    def __init__(self, left_tag_pattern, right_tag_pattern, descr):
        """
        Construct a new ``ExpandRightRule``.

        :type right_tag_pattern: str
        :param right_tag_pattern: This rule's right tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            ``left_tag_pattern``, and immediately followed by a chink
            whose beginning matches this pattern.  It will
            then expand the chunk to incorporate the new material on the right.
        :type left_tag_pattern: str
        :param left_tag_pattern: This rule's left tag
            pattern.  When applied to a ``ChunkString``, this
            rule will find any chunk whose end matches
            this pattern, and immediately followed by a chink
            whose beginning matches ``right_tag_pattern``.  It will
            then expand the chunk to incorporate the new material on the right.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_tag_pattern))

        self._left_tag_pattern = left_tag_pattern
        self._right_tag_pattern = right_tag_pattern
        # Raw strings: '\}' and '\g' are not valid Python string escapes.
        regexp = re.compile(
            r'(?P<left>%s)\}(?P<right>%s)'
            % (
                tag_pattern2re_pattern(left_tag_pattern),
                tag_pattern2re_pattern(right_tag_pattern),
            )
        )
        # Move the closing brace from between the two matches to after
        # the right one.
        RegexpChunkRule.__init__(self, regexp, r'\g<left>\g<right>}', descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ExpandRightRule: '<NN|DT|JJ>', '<NN|JJ>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        return (
            '<ExpandRightRule: '
            + unicode_repr(self._left_tag_pattern)
            + ', '
            + unicode_repr(self._right_tag_pattern)
            + '>'
        )
@python_2_unicode_compatible
class ChunkRuleWithContext(RegexpChunkRule):
    """
    A rule specifying how to add chunks to a ``ChunkString``, using
    three matching tag patterns: one for the left context, one for the
    chunk, and one for the right context.  When applied to a
    ``ChunkString``, it will find any substring that matches the chunk
    tag pattern, is surrounded by substrings that match the two
    context patterns, and is not already part of a chunk; and create a
    new chunk containing the substring that matched the chunk tag
    pattern.

    Caveat: Both the left and right context are consumed when this
    rule matches; therefore, if you need to find overlapping matches,
    you will need to apply your rule more than once.
    """

    def __init__(
        self,
        left_context_tag_pattern,
        chunk_tag_pattern,
        right_context_tag_pattern,
        descr,
    ):
        """
        Construct a new ``ChunkRuleWithContext``.

        :type left_context_tag_pattern: str
        :param left_context_tag_pattern: A tag pattern that must match
            the left context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type chunk_tag_pattern: str
        :param chunk_tag_pattern: A tag pattern that must match for this
            rule to apply.  If the rule does apply, then this pattern
            also identifies the substring that will be made into a chunk.
        :type right_context_tag_pattern: str
        :param right_context_tag_pattern: A tag pattern that must match
            the right context of ``chunk_tag_pattern`` for this rule to
            apply.
        :type descr: str
        :param descr: A short description of the purpose and/or effect
            of this rule.
        """
        # Ensure that the individual patterns are coherent.  E.g., if
        # left='(' and right=')', then this will raise an exception:
        re.compile(tag_pattern2re_pattern(left_context_tag_pattern))
        re.compile(tag_pattern2re_pattern(chunk_tag_pattern))
        re.compile(tag_pattern2re_pattern(right_context_tag_pattern))

        self._left_context_tag_pattern = left_context_tag_pattern
        self._chunk_tag_pattern = chunk_tag_pattern
        self._right_context_tag_pattern = right_context_tag_pattern
        # All three groups are consumed by the match; the trailing
        # lookahead restricts matches to chink positions.
        regexp = re.compile(
            '(?P<left>%s)(?P<chunk>%s)(?P<right>%s)%s'
            % (
                tag_pattern2re_pattern(left_context_tag_pattern),
                tag_pattern2re_pattern(chunk_tag_pattern),
                tag_pattern2re_pattern(right_context_tag_pattern),
                ChunkString.IN_CHINK_PATTERN,
            )
        )
        replacement = r'\g<left>{\g<chunk>}\g<right>'
        RegexpChunkRule.__init__(self, regexp, replacement, descr)

    def __repr__(self):
        """
        Return a string representation of this rule.  It has the form::

            <ChunkRuleWithContext: '<IN>', '<NN>', '<DT>'>

        Note that this representation does not include the
        description string; that string can be accessed
        separately with the ``descr()`` method.

        :rtype: str
        """
        # Use unicode_repr, like every other rule's __repr__, rather
        # than '%r' formatting.
        return '<ChunkRuleWithContext: %s, %s, %s>' % (
            unicode_repr(self._left_context_tag_pattern),
            unicode_repr(self._chunk_tag_pattern),
            unicode_repr(self._right_context_tag_pattern),
        )
- ##//////////////////////////////////////////////////////
- ## Tag Pattern Format Conversion
- ##//////////////////////////////////////////////////////
- # this should probably be made more strict than it is -- e.g., it
- # currently accepts 'foo'.
# Both sub-patterns are raw strings: they contain ``\{``, ``\}`` and ``\d``,
# which are invalid escape sequences in an ordinary string literal.
CHUNK_TAG_PATTERN = re.compile(
    r'^((%s|<%s>)*)$' % (r'([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+', r'[^\{\}<>]+')
)
def tag_pattern2re_pattern(tag_pattern):
    """
    Convert a tag pattern to a regular expression pattern.  A "tag
    pattern" is a modified version of a regular expression, designed
    for matching sequences of tags.  The differences between regular
    expression patterns and tag patterns are:

        - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so
          ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not
          ``'<NN'`` followed by one or more repetitions of ``'>'``.
        - Whitespace in tag patterns is ignored.  So
          ``'<DT> | <NN>'`` is equivalent to ``'<DT>|<NN>'``
        - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so
          ``'<NN.*>'`` matches any single tag starting with ``'NN'``.

    In particular, ``tag_pattern2re_pattern`` performs the following
    transformations on the given pattern:

        - Remove any whitespace
        - Add extra parens around '<' and '>', to make '<' and '>' act
          like parentheses.  E.g., so that in '<NN>+', the '+' has scope
          over the entire '<NN>'; and so that in '<NN|IN>', the '|' has
          scope over 'NN' and 'IN', but not '<' or '>'.
        - Check to make sure the resulting pattern is valid.
        - Replace every unescaped '.' with ``ChunkString.CHUNK_TAG_CHAR``.

    :type tag_pattern: str
    :param tag_pattern: The tag pattern to convert to a regular
        expression pattern.
    :raise ValueError: If ``tag_pattern`` is not a valid tag pattern,
        i.e. the cleaned-up pattern is rejected by ``CHUNK_TAG_PATTERN``.
        In particular, ``tag_pattern`` should not include braces; and it
        should not contain nested or mismatched angle-brackets.
    :rtype: str
    :return: A regular expression pattern corresponding to
        ``tag_pattern``.
    """
    # Clean up the regular expression
    tag_pattern = re.sub(r'\s', '', tag_pattern)
    tag_pattern = re.sub(r'<', '(<(', tag_pattern)
    tag_pattern = re.sub(r'>', ')>)', tag_pattern)

    # Check the regular expression
    if not CHUNK_TAG_PATTERN.match(tag_pattern):
        raise ValueError('Bad tag pattern: %r' % tag_pattern)

    # Replace "." with CHUNK_TAG_CHAR.
    # We have to do this after the check, since the replacement adds
    # {}[]<>s, which would confuse CHUNK_TAG_PATTERN.
    #
    # A '.' is escaped when it is preceded by an odd number of
    # backslashes.  ``re`` look-behind assertions must be fixed-width, so
    # that test cannot be written as a look-behind; instead the pattern
    # is reversed, matched with a look-ahead, and reversed back.  The
    # replacement text has to be reversed as well so that it comes out
    # the right way round.
    tag_char_rev = ChunkString.CHUNK_TAG_CHAR[::-1]
    pattern_rev = tag_pattern[::-1]
    pattern_rev = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tag_char_rev, pattern_rev)
    return pattern_rev[::-1]
- ##//////////////////////////////////////////////////////
- ## RegexpChunkParser
- ##//////////////////////////////////////////////////////
@python_2_unicode_compatible
class RegexpChunkParser(ChunkParserI):
    """
    A chunk parser driven by regular expressions.  A
    ``RegexpChunkParser`` holds an ordered sequence of "rules" and uses
    them to find chunks of one single type in a text.  The chunking is
    encoded in a ``ChunkString``; every rule works by rewriting the
    chunking stored in that ``ChunkString``, using regular expression
    matching and substitution.

    The rules are instances of ``RegexpChunkRule`` and its subclasses
    (``ChunkRule``, ``ChinkRule``, ``UnChunkRule``, ``MergeRule``, and
    ``SplitRule``).  Each of them provides an ``apply()`` method that
    modifies the chunking encoded by a given ``ChunkString``.

    :type _rules: list(RegexpChunkRule)
    :ivar _rules: The list of rules that should be applied to a text.
    :type _trace: int
    :ivar _trace: The default level of tracing.
    """

    def __init__(self, rules, chunk_label='NP', root_label='S', trace=0):
        """
        Construct a new ``RegexpChunkParser``.

        :type rules: list(RegexpChunkRule)
        :param rules: The rules, in application order, used to produce
            the chunking for a tagged text.
        :type chunk_label: str
        :param chunk_label: The node value given to chunk subtrees,
            typically a short name for the kind of chunk, such as
            ``"NP"`` for base noun phrases.
        :type root_label: str
        :param root_label: The node value given to the top node of the
            chunk structure.
        :type trace: int
        :param trace: The default tracing level for parsing: ``0``
            produces no tracing output, ``1`` normal tracing output, and
            ``2`` or higher verbose tracing output.
        """
        self._rules = rules
        self._trace = trace
        self._chunk_label = chunk_label
        self._root_label = root_label

    def _trace_apply(self, chunkstr, verbose):
        """
        Apply every rule of this parser to ``chunkstr`` in order,
        printing the chunk string before the first rule and again after
        each rule.  With ``verbose`` set, each rule's representation is
        printed next to its description.

        :type chunkstr: ChunkString
        :param chunkstr: The chunk string the rules are applied to.
        :type verbose: bool
        :param verbose: Whether output should be verbose.
        :rtype: None
        """
        print('# Input:')
        print(chunkstr)
        for rule in self._rules:
            rule.apply(chunkstr)
            heading = rule.descr()
            if verbose:
                heading = heading + ' (' + unicode_repr(rule) + ')'
            print('#', heading + ':')
            print(chunkstr)

    def _notrace_apply(self, chunkstr):
        """
        Apply every rule of this parser to ``chunkstr`` in order,
        without producing any output.

        :param chunkstr: The chunk string the rules are applied to.
        :type chunkstr: ChunkString
        :rtype: None
        """
        for rule in self._rules:
            rule.apply(chunkstr)

    def parse(self, chunk_struct, trace=None):
        """
        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
        :type trace: int
        :param trace: The tracing level for this call: ``0`` produces no
            tracing output, ``1`` normal tracing output, and ``2`` or
            higher verbose tracing output.  When given, it overrides the
            trace level passed to the constructor.
        :rtype: Tree
        :return: a chunk structure that encodes the chunks in a given
            tagged sentence.  A chunk is a non-overlapping linguistic
            group, such as a noun phrase.  The set of chunks
            identified in the chunk structure depends on the rules
            used to define this ``RegexpChunkParser``.
        """
        # Nothing to chunk: warn and hand back an empty root node.
        if len(chunk_struct) == 0:
            print('Warning: parsing empty text')
            return Tree(self._root_label, [])

        # Input without a label() method is wrapped in a fresh root node.
        try:
            chunk_struct.label()
        except AttributeError:
            chunk_struct = Tree(self._root_label, chunk_struct)

        # Fall back on the constructor's trace level.
        level = self._trace if trace is None else trace

        # Run the rules over the chunk string, with or without tracing.
        chunkstr = ChunkString(chunk_struct)
        if level:
            self._trace_apply(chunkstr, level > 1)
        else:
            self._notrace_apply(chunkstr)

        # Turn the rewritten chunk string back into a chunk structure.
        return chunkstr.to_chunkstruct(self._chunk_label)

    def rules(self):
        """
        :return: the sequence of rules used by ``RegexpChunkParser``.
        :rtype: list(RegexpChunkRule)
        """
        return self._rules

    def __repr__(self):
        """
        :return: a concise string representation of this
            ``RegexpChunkParser``.
        :rtype: str
        """
        return "<RegexpChunkParser with %d rules>" % len(self._rules)

    def __str__(self):
        """
        :return: a verbose string representation of this ``RegexpChunkParser``.
        :rtype: str
        """
        rule_list = self._rules
        text = "RegexpChunkParser with %d rules:\n" % len(rule_list)

        # The longest description decides the layout: short descriptions
        # share a line with the rule, long ones get a line of their own.
        widest = 0
        for rule in rule_list:
            widest = max(widest, len(rule.descr()))
        if widest >= 35:
            template = " %s\n %s\n"
        else:
            template = " %" + repr(-(widest + 3)) + "s%s\n"

        for rule in rule_list:
            text = text + template % (rule.descr(), unicode_repr(rule))

        # Drop the final newline.
        return text[:-1]
- ##//////////////////////////////////////////////////////
- ## Chunk Grammar
- ##//////////////////////////////////////////////////////
@python_2_unicode_compatible
class RegexpParser(ChunkParserI):
    r"""
    A grammar based chunk parser.  ``chunk.RegexpParser`` uses a set of
    regular expression patterns to specify the behavior of the parser.
    The chunking of the text is encoded using a ``ChunkString``, and
    each rule acts by modifying the chunking in the ``ChunkString``.
    The rules are all implemented using regular expression matching
    and substitution.

    A grammar contains one or more clauses in the following form::

     NP:
       {<DT|JJ>}          # chunk determiners and adjectives
       }<[\.VI].*>+{      # chink any tag beginning with V, I, or .
       <.*>}{<DT>         # split a chunk at a determiner
       <DT|JJ>{}<NN.*>    # merge chunk ending with det/adj
                          # with one starting with a noun

    The patterns of a clause are executed in order.  An earlier
    pattern may introduce a chunk boundary that prevents a later
    pattern from executing.  Sometimes an individual pattern will
    match on multiple, overlapping extents of the input.  As with
    regular expression substitution more generally, the chunker will
    identify the first match possible, then continue looking for matches
    after this one has ended.

    The clauses of a grammar are also executed in order.  A cascaded
    chunk parser is one having more than one clause.  The maximum depth
    of a parse tree created by this chunk parser is the same as the
    number of clauses in the grammar.

    When tracing is turned on, the comment portion of a line is displayed
    each time the corresponding pattern is applied.

    :type _start: str
    :ivar _start: The start symbol of the grammar (the root node of
        resulting trees)
    :type _stages: list(RegexpChunkParser)
    :ivar _stages: The list of parsing stages corresponding to the grammar
    """

    def __init__(self, grammar, root_label='S', loop=1, trace=0):
        """
        Create a new chunk parser, from the given start state
        and set of chunk patterns.

        :param grammar: The grammar, or a list of RegexpChunkParser objects
        :type grammar: str or list(RegexpChunkParser)
        :param root_label: The top node of the tree being created
        :type root_label: str or Nonterminal
        :param loop: The number of times to run through the patterns
        :type loop: int
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.
        :raise TypeError: If ``grammar`` is neither a string nor an
            iterable of ``RegexpChunkParser`` objects.
        :raise ValueError: If a string grammar contains rules that are
            not preceded by a stage marker.
        """
        self._trace = trace
        self._stages = []
        self._grammar = grammar
        self._loop = loop

        if isinstance(grammar, string_types):
            self._read_grammar(grammar, root_label, trace)
        else:
            # Make sure the grammar looks like it has the right type:
            type_err = 'Expected string or list of RegexpChunkParsers for the grammar.'
            try:
                grammar = list(grammar)
            except Exception:
                # Deliberately not a bare ``except:``, so that
                # KeyboardInterrupt and SystemExit are not converted
                # into a TypeError.
                raise TypeError(type_err)
            for elt in grammar:
                if not isinstance(elt, RegexpChunkParser):
                    raise TypeError(type_err)
            self._stages = grammar

    def _read_grammar(self, grammar, root_label, trace):
        """
        Helper function for __init__: read the grammar if it is a
        string.

        Each line is stripped.  A line containing an unescaped ':'
        closes the stage collected so far and opens a new one, named by
        the text before the ':'; the text after it is treated as the
        first rule line of the new stage.  Blank lines and lines that
        start with '#' are skipped; every other line is turned into a
        rule with ``RegexpChunkRule.fromstring``.
        """
        # New stage begins if there's an unescaped ':'.  The pattern is a
        # raw string so that ``\\.`` means "a backslash followed by any
        # character" (an escape pair); a backslash is excluded from the
        # second alternative so that a ':' following a backslash can only
        # be consumed as part of such a pair.
        stage_marker = re.compile(r'(?P<nonterminal>(\\.|[^:\\])*)(:(?P<rule>.*))')

        rules = []
        lhs = None
        for line in grammar.split('\n'):
            line = line.strip()

            m = stage_marker.match(line)
            if m:
                # Record the stage that we just completed.
                self._add_stage(rules, lhs, root_label, trace)
                # Start a new stage.
                lhs = m.group('nonterminal').strip()
                rules = []
                line = m.group('rule').strip()

            # Skip blank & comment-only lines
            if line == '' or line.startswith('#'):
                continue

            # Add the rule
            rules.append(RegexpChunkRule.fromstring(line))

        # Record the final stage
        self._add_stage(rules, lhs, root_label, trace)

    def _add_stage(self, rules, lhs, root_label, trace):
        """
        Helper function for __init__: add a new stage to the parser.

        Does nothing when ``rules`` is empty.

        :raise ValueError: If there are rules but no stage name ``lhs``.
        """
        if not rules:
            return
        if not lhs:
            raise ValueError('Expected stage marker (eg NP:)')
        parser = RegexpChunkParser(
            rules, chunk_label=lhs, root_label=root_label, trace=trace
        )
        self._stages.append(parser)

    def parse(self, chunk_struct, trace=None):
        """
        Apply the chunk parser to this input.

        :type chunk_struct: Tree
        :param chunk_struct: the chunk structure to be (further) chunked
            (this tree is modified, and is also returned)
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            ``1`` will generate normal tracing output; and ``2`` or
            higher will generate verbose tracing output.  This value
            overrides the trace level value that was given to the
            constructor.
        :return: the chunked output.
        :rtype: Tree
        """
        if trace is None:
            trace = self._trace
        # Run the full cascade of stages ``loop`` times, feeding each
        # stage the output of the previous one.
        for _ in range(self._loop):
            for parser in self._stages:
                chunk_struct = parser.parse(chunk_struct, trace=trace)
        return chunk_struct

    def __repr__(self):
        """
        :return: a concise string representation of this ``chunk.RegexpParser``.
        :rtype: str
        """
        return "<chunk.RegexpParser with %d stages>" % len(self._stages)

    def __str__(self):
        """
        :return: a verbose string representation of this
            ``RegexpParser``.
        :rtype: str
        """
        s = "chunk.RegexpParser with %d stages:\n" % len(self._stages)
        for parser in self._stages:
            s += "%s\n" % parser
        # Drop the final newline.
        return s[:-1]
- ##//////////////////////////////////////////////////////
- ## Demonstration code
- ##//////////////////////////////////////////////////////
def demo_eval(chunkparser, text):
    """
    Demonstration code for evaluating a chunk parser, using a
    ``ChunkScore``.  This function assumes that ``text`` contains one
    sentence per line, and that each sentence has the form expected by
    ``tree.chunk``.  It runs the given chunk parser on each sentence in
    the text, and scores the result.  It prints the final score
    (precision, recall, and f-measure); and reports the set of chunks
    that were missed and the set of chunks that were incorrect.  (At
    most 10 missing chunks and 10 incorrect chunks are reported).

    :param chunkparser: The chunkparser to be tested
    :type chunkparser: ChunkParserI
    :param text: The chunked tagged text that should be used for
        evaluation.
    :type text: str
    """
    from nltk import chunk
    from nltk.tree import Tree

    def _report(title, chunks):
        """Print ``title`` and at most ten of ``chunks``; skip if empty."""
        if not chunks:
            return
        print(title)
        # The loop variable is deliberately not called ``chunk``: that
        # name is the module imported above.
        for item in chunks[:10]:
            print(' ', ' '.join(map(str, item)))
        if len(chunks) > 10:
            print(' ...')

    # Evaluate our chunk parser.
    chunkscore = chunk.ChunkScore()

    for sentence in text.split('\n'):
        print(sentence)
        sentence = sentence.strip()
        if not sentence:
            continue
        # The gold-standard tree supplies the tokens; the parser sees
        # them as a flat sentence and its output is scored against gold.
        gold = chunk.tagstr2tree(sentence)
        tokens = gold.leaves()
        test = chunkparser.parse(Tree('S', tokens), trace=1)
        chunkscore.score(gold, test)
        print()

    print('/' + ('=' * 75) + '\\')
    print('Scoring', chunkparser)
    print('-' * 77)
    print('Precision: %5.1f%%' % (chunkscore.precision() * 100), ' ' * 4, end=' ')
    print('Recall: %5.1f%%' % (chunkscore.recall() * 100), ' ' * 6, end=' ')
    print('F-Measure: %5.1f%%' % (chunkscore.f_measure() * 100))

    # Missed chunks, then incorrect chunks; each list is fetched once.
    # NOTE(review): assumes missed()/incorrect() return a sliceable
    # sequence and give the same result on repeated calls -- confirm
    # against ChunkScore.
    _report('Missed:', chunkscore.missed())
    _report('Incorrect:', chunkscore.incorrect())

    print('\\' + ('=' * 75) + '/')
    print()
def demo():
    """
    Demonstrate the ``RegexpChunkParser`` class.  One evaluation text
    is parsed with four different chunk parsers that use a variety of
    rules and strategies; afterwards accuracy is measured on the
    CoNLL-2000 corpus and a list of tagged tokens is parsed directly.
    """
    from nltk import chunk, Tree

    text = """\
    [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
    [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
    [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
    """

    def evaluate(grammar_text):
        """Build a parser from ``grammar_text`` and score it on ``text``."""
        demo_eval(chunk.RegexpParser(grammar_text), text)

    banner = '*' * 75
    print(banner)
    print('Evaluation text:')
    print(text)
    print(banner)
    print()

    evaluate(
        r"""
    NP: # NP stage
    {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
    {<NNP>+} # chunk proper nouns
    """
    )

    evaluate(
        r"""
    NP:
    {<.*>} # start by chunking each tag
    }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
    <DT|JJ>{}<NN.*> # merge det/adj with nouns
    """
    )

    evaluate(
        r"""
    NP: {<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>} # VP = verb words
    """
    )

    evaluate(
        r"""
    NP: {<.*>*} # start by chunking everything
    }<[\.VI].*>+{ # chink any verbs, prepositions or periods
    <.*>}{<DT> # separate on determiners
    PP: {<IN><NP>} # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
    """
    )

    # Evaluation
    from nltk.corpus import conll2000

    print()
    print("Demonstration of empty grammar:")
    empty_parser = chunk.RegexpParser("")
    np_sents = conll2000.chunked_sents('test.txt', chunk_types=('NP',))
    print(chunk.accuracy(empty_parser, np_sents))

    print()
    print("Demonstration of accuracy evaluation using CoNLL tags:")
    conll_grammar = r"""
    NP:
    {<.*>} # start by chunking each tag
    }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods
    <DT|JJ>{}<NN.*> # merge det/adj with nouns
    """
    conll_parser = chunk.RegexpParser(conll_grammar)
    print(chunk.accuracy(conll_parser, conll2000.chunked_sents('test.txt')[:5]))

    print()
    print("Demonstration of tagged token input")
    cascade_grammar = r"""
    NP: {<.*>*} # start by chunking everything
    }<[\.VI].*>+{ # chink any verbs, prepositions or periods
    <.*>}{<DT> # separate on determiners
    PP: {<IN><NP>} # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs
    """
    cascade_parser = chunk.RegexpParser(cascade_grammar)
    tagged_tokens = [
        ("the", "DT"),
        ("little", "JJ"),
        ("cat", "NN"),
        ("sat", "VBD"),
        ("on", "IN"),
        ("the", "DT"),
        ("mat", "NN"),
        (".", "."),
    ]
    print(cascade_parser.parse(tagged_tokens))
# Run the demonstration only when this file is executed as a script.
if '__main__' == __name__:
    demo()
|