123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647 |
- # Natural Language Toolkit: Chunk format conversions
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # Steven Bird <stevenbird1@gmail.com> (minor additions)
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import print_function, unicode_literals, division
- import re
- from nltk.tree import Tree
- from nltk.tag.mapping import map_tag
- from nltk.tag.util import str2tuple
- from nltk.compat import python_2_unicode_compatible
- ##//////////////////////////////////////////////////////
- ## EVALUATION
- ##//////////////////////////////////////////////////////
- from nltk.metrics import accuracy as _accuracy
def accuracy(chunker, gold):
    """
    Compute the tag-level accuracy of ``chunker`` on a gold standard.

    Each gold tree is flattened (removing its chunk structure), re-parsed
    with ``chunker``, and both the gold tree and the re-parsed tree are
    converted to ``(word, tag, IOB-tag)`` triples.  The two accumulated
    triple sequences are then compared with ``nltk.metrics.accuracy``.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    reference = []
    predicted = []
    for gold_tree in gold:
        rechunked = chunker.parse(gold_tree.flatten())
        reference.extend(tree2conlltags(gold_tree))
        predicted.extend(tree2conlltags(rechunked))
    return _accuracy(reference, predicted)
- # Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
- # -- statistics are evaluated only on demand, instead of at every sentence evaluation
- #
- # SB: use nltk.metrics for precision/recall scoring?
- #
class ChunkScore(object):
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: Intended maximum number of true positive
          examples to record (default 100).
        - max_fp_examples: Intended maximum number of false positive
          examples to record (default 100).
        - max_fn_examples: Intended maximum number of false negative
          examples to record (default 100).
        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

        The three ``max_*_examples`` values are stored on the instance
        but are not consulted by any method of this class, so the
        accessor methods are not truncated.  They never affect the
        numerical metrics (precision, recall, or f-measure).

    Every recorded chunk is stored as a ``((sentence_index, position),
    chunk)`` pair, where ``sentence_index`` is the number of sentences
    scored before it.

    :type _correct: set
    :ivar _correct: All chunks seen in the gold standard sentences
    :type _guessed: set
    :ivar _guessed: All chunks seen in the guessed sentences
    :type _tp: set
    :ivar _tp: Set of true positives
    :type _fp: set
    :ivar _fp: Set of false positives
    :type _fn: set
    :ivar _fn: Set of false negatives
    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        self._max_tp = kwargs.get('max_tp_examples', 100)
        self._max_fp = kwargs.get('max_fp_examples', 100)
        self._max_fn = kwargs.get('max_fn_examples', 100)
        self._chunk_label = kwargs.get('chunk_label', '.*')
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        # Number of sentences scored so far; used as the sentence index
        # in the position key of every recorded chunk.
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0
        # Set by score(); the derived sets and counts are recomputed
        # lazily the next time a statistic is requested.
        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        """
        Recompute the true positive, false positive and false negative
        sets (and their sizes) if ``score`` has been called since the
        last recomputation.
        """
        if not self._measuresNeedUpdate:
            return
        self._tp = self._guessed & self._correct
        self._fn = self._correct - self._guessed
        self._fp = self._guessed - self._correct
        self._tp_num = len(self._tp)
        self._fp_num = len(self._fp)
        self._fn_num = len(self._fn)
        self._measuresNeedUpdate = False

    @staticmethod
    def _in_input_order(chunks):
        """
        Return the chunks of the given ``((sentence, position), chunk)``
        pairs as a list ordered by their position key.

        Only the key is compared, so the chunks themselves never need
        to be orderable.
        """
        return [chunk for (_key, chunk) in sorted(chunks, key=lambda c: c[0])]

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all text that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.  If no tags have been scored, 1 is returned.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.  If nothing was guessed, 0 is
        returned.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.  If there are no gold standard
        chunks, 0 is returned.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.  If either precision or recall
        is 0, 0 is returned.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:  # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fn)  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        return self._in_input_order(self._fp)  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._correct)  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        return self._in_input_order(self._guessed)  # discard position information

    def __len__(self):
        """
        Return the number of gold standard chunks (true positives plus
        false negatives).
        """
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return '<ChunkScoring of ' + repr(len(self)) + ' chunks>'

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        # str.format() does not treat '%%' as an escape, so a single '%'
        # is used to print one percent sign per line.
        return (
            "ChunkParse score:\n"
            + (" IOB Accuracy: {:5.1f}%\n".format(self.accuracy() * 100))
            + (" Precision: {:5.1f}%\n".format(self.precision() * 100))
            + (" Recall: {:5.1f}%\n".format(self.recall() * 100))
            + (" F-Measure: {:5.1f}%".format(self.f_measure() * 100))
        )
- # extract chunks, and assign unique id, the absolute position of
- # the first word of the chunk
- def _chunksets(t, count, chunk_label):
- pos = 0
- chunks = []
- for child in t:
- if isinstance(child, Tree):
- if re.match(chunk_label, child.label()):
- chunks.append(((count, pos), child.freeze()))
- pos += len(child.leaves())
- else:
- pos += 1
- return set(chunks)
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
    """
    Build a Tree from a string of bracketed, tagged text.

    Chunks are marked by square brackets (``[...]``) and may not be
    nested.  Words are delimited by whitespace, and each word should have
    the form ``text/tag``; it is split into a ``(word, tag)`` pair with
    ``str2tuple``.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :param sep: The separator between a word and its tag.  If None, each
        token is added to the tree as a plain string.
    :param source_tagset: If both this and ``target_tagset`` are given,
        each tag is converted with ``map_tag``.
    :param target_tagset: See ``source_tagset``.
    :raise ValueError: If the brackets are nested or unbalanced.
    :rtype: Tree
    """
    token_re = re.compile(r'\[|\]|[^\[\]\s]+')
    convert_tags = source_tagset and target_tagset

    # open_nodes[0] is the root; open_nodes[1], if present, is the chunk
    # currently being filled.
    open_nodes = [Tree(root_label, [])]
    for token_m in token_re.finditer(s):
        token = token_m.group()
        if token[0] == '[':
            if len(open_nodes) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(token_m.start()))
            new_chunk = Tree(chunk_label, [])
            open_nodes[-1].append(new_chunk)
            open_nodes.append(new_chunk)
            continue
        if token[0] == ']':
            if len(open_nodes) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(token_m.start()))
            open_nodes.pop()
            continue
        if sep is None:
            open_nodes[-1].append(token)
            continue
        word, tag = str2tuple(token, sep)
        if convert_tags:
            tag = map_tag(source_tagset, target_tagset, tag)
        open_nodes[-1].append((word, tag))

    if len(open_nodes) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return open_nodes[0]
- ### CONLL
- _LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Convert a single sentence in CoNLL 2000 IOB string format into a
    chunk structure.

    Each non-blank line of ``s`` must hold a word, a tag and an IOB chunk
    tag.  Chunks whose type is not listed in ``chunk_types`` are treated
    as being outside any chunk.  An "I" tag whose type differs from the
    label of the node currently being filled starts a new chunk.

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted; None keeps
        every chunk type.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :raise ValueError: If a non-blank line cannot be decoded.
    :rtype: Tree
    """
    # nodes[0] is the root; nodes[1], if present, is the open chunk.
    nodes = [Tree(root_label, [])]

    for lineno, raw_line in enumerate(s.split('\n')):
        if not raw_line.strip():
            continue

        decoded = _LINE_RE.match(raw_line)
        if decoded is None:
            raise ValueError('Error on line {:d}'.format(lineno))
        word, tag, state, chunk_type = decoded.groups()

        # Chunk types that were not requested count as "Outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # A new chunk starts on "Begin", or on an "Inside" whose type
        # does not match the node currently being filled.
        starts_chunk = state == 'B' or (
            state == 'I' and chunk_type != nodes[-1].label()
        )

        # Close the open chunk (if any) before leaving or replacing it.
        if (state == 'O' or starts_chunk) and len(nodes) == 2:
            nodes.pop()

        if starts_chunk:
            new_chunk = Tree(chunk_type, [])
            nodes[-1].append(new_chunk)
            nodes.append(new_chunk)

        nodes[-1].append((word, tag))

    return nodes[0]
def tree2conlltags(t):
    """
    Convert a tree to the CoNLL IOB tag format.

    Children of ``t`` that have a ``label()`` method are treated as
    chunks: their first token is tagged ``B-<label>`` and the remaining
    tokens ``I-<label>``.  All other children are tagged ``O``.

    :param t: The tree to be converted.
    :type t: Tree
    :raise ValueError: If a chunk itself contains a Tree.
    :rtype: list(tuple)
    :return: A list of 3-tuples containing ``(word, tag, IOB-tag)``.
    """
    triples = []
    for node in t:
        try:
            label = node.label()
            for index, leaf in enumerate(node):
                if isinstance(leaf, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                marker = "B-" if index == 0 else "I-"
                triples.append((leaf[0], leaf[1], marker + label))
        except AttributeError:
            # No label(): a bare (word, tag) token outside any chunk.
            triples.append((node[0], node[1], "O"))
    return triples
def conlltags2tree(
    sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
):
    """
    Convert the CoNLL IOB format to a tree.

    :param sentence: An iterable of ``(word, postag, chunktag)`` triples.
    :param chunk_types: Accepted for interface compatibility; this
        function does not consult it.
    :param root_label: The node label to use for the root.
    :param strict: If true, a ``None`` chunk tag or an ``I-`` tag that
        does not continue the preceding chunk raises ``ValueError``.
        Otherwise ``None`` is treated as ``O`` and such an ``I-`` tag is
        treated as ``B-``.
    :raise ValueError: For an unrecognised chunk tag, or for a bad tag
        sequence when ``strict`` is true.
    :rtype: Tree
    """
    tree = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        token = (word, postag)

        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            # Treat as O
            tree.append(token)
            continue

        if chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [token]))
            continue

        if chunktag.startswith('I-'):
            continues_chunk = (
                len(tree) != 0
                and isinstance(tree[-1], Tree)
                and tree[-1].label() == chunktag[2:]
            )
            if continues_chunk:
                tree[-1].append(token)
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as B-*
                tree.append(Tree(chunktag[2:], [token]))
            continue

        if chunktag == 'O':
            tree.append(token)
            continue

        raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree
def tree2conllstr(t):
    """
    Convert a tree to the CoNLL IOB string format.

    The tree is converted with ``tree2conlltags``; each resulting triple
    becomes one line with its word, tag and IOB tag separated by single
    spaces.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    return '\n'.join(" ".join(triple) for triple in tree2conlltags(t))
- ### IEER
- _IEER_DOC_RE = re.compile(
- r'<DOC>\s*'
- r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
- r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
- r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
- r'<BODY>\s*'
- r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
- r'<TEXT>(?P<text>.*?)</TEXT>\s*'
- r'</BODY>\s*</DOC>\s*',
- re.DOTALL,
- )
- _IEER_TYPE_RE = re.compile('<b_\w+\s+[^>]*?type="(?P<type>\w+)"')
- def _ieer_read_text(s, root_label):
- stack = [Tree(root_label, [])]
- # s will be None if there is no headline in the text
- # return the empty list in place of a Tree
- if s is None:
- return []
- for piece_m in re.finditer('<[^>]+>|[^\s<]+', s):
- piece = piece_m.group()
- try:
- if piece.startswith('<b_'):
- m = _IEER_TYPE_RE.match(piece)
- if m is None:
- print('XXXX', piece)
- chunk = Tree(m.group('type'), [])
- stack[-1].append(chunk)
- stack.append(chunk)
- elif piece.startswith('<e_'):
- stack.pop()
- # elif piece.startswith('<'):
- # print "ERROR:", piece
- # raise ValueError # Unexpected HTML
- else:
- stack[-1].append(piece)
- except (IndexError, ValueError):
- raise ValueError(
- 'Bad IEER string (error at character {:d})'.format(piece_m.start())
- )
- if len(stack) != 1:
- raise ValueError('Bad IEER string')
- return stack[0]
def ieerstr2tree(
    s,
    chunk_types=[
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
    ],
    root_label="S",
):
    """
    Convert a string of chunked text in the IEER named entity format
    into a chunk structure.

    If ``s`` is a complete ``<DOC>...</DOC>`` document, a dictionary is
    returned with the keys ``'text'``, ``'docno'``, ``'doctype'``,
    ``'date_time'`` and ``'headline'``.  The text and the headline are
    both converted to chunk structures, so that named entities in the
    headline are captured too.  Otherwise the whole of ``s`` is treated
    as if it were the contents of ``<TEXT>...</TEXT>`` and a single chunk
    structure is returned.

    :param s: The IEER string to be converted.
    :type s: str
    :param chunk_types: Accepted for interface compatibility; this
        function does not consult it.
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: dict or Tree
    """
    doc = _IEER_DOC_RE.match(s)
    if not doc:
        # Not a single well-formed document: parse everything as text.
        return _ieer_read_text(s, root_label)

    result = {}
    result['text'] = _ieer_read_text(doc.group('text'), root_label)
    result['docno'] = doc.group('docno')
    result['doctype'] = doc.group('doctype')
    result['date_time'] = doc.group('date_time')
    result['headline'] = _ieer_read_text(doc.group('headline'), root_label)
    return result
def demo():
    """
    Demonstrate the conversion functions: parse a bracketed tagged
    string, parse a CoNLL string keeping only NP and PP chunks, and print
    the latter back out in CoNLL format.
    """
    import nltk

    bracketed = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    bracketed_tree = nltk.chunk.tagstr2tree(bracketed, chunk_label='NP')
    bracketed_tree.pprint()
    print()

    conll_text = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""
    conll_tree = conllstr2tree(conll_text, chunk_types=('NP', 'PP'))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


if __name__ == '__main__':
    demo()
|