# Natural Language Toolkit: Chunk format conversions
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com> (minor additions)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, unicode_literals, division

import re

from nltk.tree import Tree
from nltk.tag.mapping import map_tag
from nltk.tag.util import str2tuple
from nltk.compat import python_2_unicode_compatible

##//////////////////////////////////////////////////////
##  EVALUATION
##//////////////////////////////////////////////////////

from nltk.metrics import accuracy as _accuracy


def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

    # print 'GOLD:', gold_tags[:50]
    # print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)
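

# A minimal, illustrative sketch of driving accuracy() directly, assuming
# nltk.chunk.regexp.RegexpParser as the chunker; the one-sentence "gold
# standard" below is invented purely for demonstration.
def _example_accuracy():
    # Local import avoids a circular import at module load time.
    from nltk.chunk.regexp import RegexpParser

    gold = [
        Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])
    ]
    chunker = RegexpParser('NP: {<DT>?<NN>}')
    # The chunker re-chunks the flattened gold sentence; a perfect match scores 1.0.
    return accuracy(chunker, gold)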


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
class ChunkScore(object):
    """
    A utility class for scoring chunk parsers.  ``ChunkScore`` can
    evaluate a chunk parser's output, based on a number of statistics
    (precision, recall, f-measure, missed chunks, incorrect chunks).
    It can also combine the scores from the parsing of multiple texts;
    this makes it significantly easier to evaluate a chunk parser that
    operates one sentence at a time.

    Texts are evaluated with the ``score`` method.  The results of
    evaluation can be accessed via a number of accessor methods, such
    as ``precision`` and ``f_measure``.  A typical use of the
    ``ChunkScore`` class is::

        >>> chunkscore = ChunkScore()           # doctest: +SKIP
        >>> for correct in correct_sentences:   # doctest: +SKIP
        ...     guess = chunkparser.parse(correct.leaves())   # doctest: +SKIP
        ...     chunkscore.score(correct, guess)              # doctest: +SKIP
        >>> print('F Measure:', chunkscore.f_measure())       # doctest: +SKIP
        F Measure: 0.823

    :ivar kwargs: Keyword arguments:

        - max_tp_examples: The maximum number of actual examples of true
          positives to record.  This affects the ``correct`` member
          function: ``correct`` will not return more than this number
          of true positive examples.  This does *not* affect any of
          the numerical metrics (precision, recall, or f-measure).

        - max_fp_examples: The maximum number of actual examples of false
          positives to record.  This affects the ``incorrect`` member
          function and the ``guessed`` member function: ``incorrect``
          will not return more than this number of examples, and
          ``guessed`` will not return more than this number of true
          positive examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure).

        - max_fn_examples: The maximum number of actual examples of false
          negatives to record.  This affects the ``missed`` member
          function and the ``correct`` member function: ``missed``
          will not return more than this number of examples, and
          ``correct`` will not return more than this number of true
          negative examples.  This does *not* affect any of the
          numerical metrics (precision, recall, or f-measure).

        - chunk_label: A regular expression indicating which chunks
          should be compared.  Defaults to ``'.*'`` (i.e., all chunks).

    :type _tp: list(Token)
    :ivar _tp: List of true positives
    :type _fp: list(Token)
    :ivar _fp: List of false positives
    :type _fn: list(Token)
    :ivar _fn: List of false negatives
    :type _tp_num: int
    :ivar _tp_num: Number of true positives
    :type _fp_num: int
    :ivar _fp_num: Number of false positives
    :type _fn_num: int
    :ivar _fn_num: Number of false negatives.
    """

    def __init__(self, **kwargs):
        self._correct = set()
        self._guessed = set()
        self._tp = set()
        self._fp = set()
        self._fn = set()
        self._max_tp = kwargs.get('max_tp_examples', 100)
        self._max_fp = kwargs.get('max_fp_examples', 100)
        self._max_fn = kwargs.get('max_fn_examples', 100)
        self._chunk_label = kwargs.get('chunk_label', '.*')
        self._tp_num = 0
        self._fp_num = 0
        self._fn_num = 0
        self._count = 0
        self._tags_correct = 0.0
        self._tags_total = 0.0

        self._measuresNeedUpdate = False

    def _updateMeasures(self):
        if self._measuresNeedUpdate:
            self._tp = self._guessed & self._correct
            self._fn = self._correct - self._guessed
            self._fp = self._guessed - self._correct
            self._tp_num = len(self._tp)
            self._fp_num = len(self._fp)
            self._fn_num = len(self._fn)
            self._measuresNeedUpdate = False

    def score(self, correct, guessed):
        """
        Given a correctly chunked sentence, score another chunked
        version of the same sentence.

        :type correct: chunk structure
        :param correct: The known-correct ("gold standard") chunked
            sentence.
        :type guessed: chunk structure
        :param guessed: The chunked sentence to be scored.
        """
        self._correct |= _chunksets(correct, self._count, self._chunk_label)
        self._guessed |= _chunksets(guessed, self._count, self._chunk_label)
        self._count += 1
        self._measuresNeedUpdate = True
        # Keep track of per-tag accuracy (if possible)
        try:
            correct_tags = tree2conlltags(correct)
            guessed_tags = tree2conlltags(guessed)
        except ValueError:
            # This exception case is for nested chunk structures,
            # where tree2conlltags will fail with a ValueError: "Tree
            # is too deeply nested to be printed in CoNLL format."
            correct_tags = guessed_tags = ()
        self._tags_total += len(correct_tags)
        self._tags_correct += sum(
            1 for (t, g) in zip(guessed_tags, correct_tags) if t == g
        )

    def accuracy(self):
        """
        Return the overall tag-based accuracy for all texts that have
        been scored by this ``ChunkScore``, using the IOB (conll2000)
        tag encoding.

        :rtype: float
        """
        if self._tags_total == 0:
            return 1
        return self._tags_correct / self._tags_total

    def precision(self):
        """
        Return the overall precision for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fp_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def recall(self):
        """
        Return the overall recall for all texts that have been
        scored by this ``ChunkScore``.

        :rtype: float
        """
        self._updateMeasures()
        div = self._tp_num + self._fn_num
        if div == 0:
            return 0
        else:
            return self._tp_num / div

    def f_measure(self, alpha=0.5):
        """
        Return the overall F measure for all texts that have been
        scored by this ``ChunkScore``.

        :param alpha: the relative weighting of precision and recall.
            Larger alpha biases the score towards the precision value,
            while smaller alpha biases the score towards the recall
            value.  ``alpha`` should have a value in the range [0,1].
        :type alpha: float
        :rtype: float
        """
        self._updateMeasures()
        p = self.precision()
        r = self.recall()
        if p == 0 or r == 0:  # what if alpha is 0 or 1?
            return 0
        return 1 / (alpha / p + (1 - alpha) / r)

    def missed(self):
        """
        Return the chunks which were included in the
        correct chunk structures, but not in the guessed chunk
        structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fn)
        return [c[1] for c in chunks]  # discard position information

    def incorrect(self):
        """
        Return the chunks which were included in the guessed chunk structures,
        but not in the correct chunk structures, listed in input order.

        :rtype: list of chunks
        """
        self._updateMeasures()
        chunks = list(self._fp)
        return [c[1] for c in chunks]  # discard position information

    def correct(self):
        """
        Return the chunks which were included in the correct
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._correct)
        return [c[1] for c in chunks]  # discard position information

    def guessed(self):
        """
        Return the chunks which were included in the guessed
        chunk structures, listed in input order.

        :rtype: list of chunks
        """
        chunks = list(self._guessed)
        return [c[1] for c in chunks]  # discard position information

    def __len__(self):
        self._updateMeasures()
        return self._tp_num + self._fn_num

    def __repr__(self):
        """
        Return a concise representation of this ``ChunkScoring``.

        :rtype: str
        """
        return '<ChunkScoring of ' + repr(len(self)) + ' chunks>'

    def __str__(self):
        """
        Return a verbose representation of this ``ChunkScoring``.
        This representation includes the precision, recall, and
        f-measure scores.  For other information about the score,
        use the accessor methods (e.g., ``missed()`` and ``incorrect()``).

        :rtype: str
        """
        return (
            "ChunkParse score:\n"
            + ("    IOB Accuracy: {:5.1f}%\n".format(self.accuracy() * 100))
            + ("    Precision:    {:5.1f}%\n".format(self.precision() * 100))
            + ("    Recall:       {:5.1f}%\n".format(self.recall() * 100))
            + ("    F-Measure:    {:5.1f}%".format(self.f_measure() * 100))
        )
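

# A minimal, runnable sketch of ChunkScore in use, assuming hand-built gold and
# guessed trees; the sentence and the guessed chunking below are invented for
# illustration (the guess misses the second NP, so recall drops below 1.0).
def _example_chunkscore():
    gold = Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]),
                      ('saw', 'VBD'),
                      Tree('NP', [('a', 'DT'), ('dog', 'NN')])])
    guess = Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]),
                       ('saw', 'VBD'), ('a', 'DT'), ('dog', 'NN')])
    chunkscore = ChunkScore()
    chunkscore.score(gold, guess)
    # precision = 1/1, recall = 1/2, f_measure (alpha=0.5) = 2PR/(P+R) = 2/3
    return chunkscore.precision(), chunkscore.recall(), chunkscore.f_measure()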


# extract chunks, and assign unique id, the absolute position of
# the first word of the chunk
def _chunksets(t, count, chunk_label):
    pos = 0
    chunks = []
    for child in t:
        if isinstance(child, Tree):
            if re.match(chunk_label, child.label()):
                chunks.append(((count, pos), child.freeze()))
            pos += len(child.leaves())
        else:
            pos += 1
    return set(chunks)


def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketed tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]
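

# A small, illustrative check of tagstr2tree() on a made-up bracketed string,
# showing that bracketed spans become ``chunk_label`` subtrees under the root.
def _example_tagstr2tree():
    tree = tagstr2tree("[ the/DT dog/NN ] barked/VBD", chunk_label='NP')
    assert tree == Tree('S', [Tree('NP', [('the', 'DT'), ('dog', 'NN')]),
                              ('barked', 'VBD')])
    return tree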


### CONLL

_LINE_RE = re.compile(r'(\S+)\s+(\S+)\s+([IOB])-?(\S+)?')


def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line {:d}'.format(lineno))
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        if state in 'BO' or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
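

# An illustrative sketch of conllstr2tree() on a tiny, made-up CoNLL fragment,
# showing that chunk types outside ``chunk_types`` (here the VP) are flattened
# into the root as plain tokens.
def _example_conllstr2tree():
    s = ("the DT B-NP\n"
         "cat NN I-NP\n"
         "sat VBD B-VP\n")
    tree = conllstr2tree(s, chunk_types=('NP',))
    assert tree == Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]),
                              ('sat', 'VBD')])
    return tree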


def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                tags.append((contents[0], contents[1], prefix + category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags


def conlltags2tree(
    sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            if (
                len(tree) == 0
                or not isinstance(tree[-1], Tree)
                or tree[-1].label() != chunktag[2:]
            ):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == 'O':
            tree.append((word, postag))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree
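

# A minimal, runnable sketch showing that tree2conlltags() and conlltags2tree()
# round-trip a shallow chunk tree; the sentence is invented for illustration.
def _example_conll_roundtrip():
    tree = Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]),
                      ('sat', 'VBD')])
    tags = tree2conlltags(tree)
    assert tags == [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'),
                    ('sat', 'VBD', 'O')]
    assert conlltags2tree(tags) == tree
    return tags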


def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    lines = [" ".join(token) for token in tree2conlltags(t)]
    return '\n'.join(lines)


### IEER

_IEER_DOC_RE = re.compile(
    r'<DOC>\s*'
    r'(<DOCNO>\s*(?P<docno>.+?)\s*</DOCNO>\s*)?'
    r'(<DOCTYPE>\s*(?P<doctype>.+?)\s*</DOCTYPE>\s*)?'
    r'(<DATE_TIME>\s*(?P<date_time>.+?)\s*</DATE_TIME>\s*)?'
    r'<BODY>\s*'
    r'(<HEADLINE>\s*(?P<headline>.+?)\s*</HEADLINE>\s*)?'
    r'<TEXT>(?P<text>.*?)</TEXT>\s*'
    r'</BODY>\s*</DOC>\s*',
    re.DOTALL,
)

_IEER_TYPE_RE = re.compile(r'<b_\w+\s+[^>]*?type="(?P<type>\w+)"')


def _ieer_read_text(s, root_label):
    stack = [Tree(root_label, [])]
    # s will be None if there is no headline in the text
    # return the empty list in place of a Tree
    if s is None:
        return []
    for piece_m in re.finditer(r'<[^>]+>|[^\s<]+', s):
        piece = piece_m.group()
        try:
            if piece.startswith('<b_'):
                m = _IEER_TYPE_RE.match(piece)
                if m is None:
                    print('XXXX', piece)
                chunk = Tree(m.group('type'), [])
                stack[-1].append(chunk)
                stack.append(chunk)
            elif piece.startswith('<e_'):
                stack.pop()
            #           elif piece.startswith('<'):
            #               print "ERROR:", piece
            #               raise ValueError # Unexpected HTML
            else:
                stack[-1].append(piece)
        except (IndexError, ValueError):
            raise ValueError(
                'Bad IEER string (error at character {:d})'.format(piece_m.start())
            )
    if len(stack) != 1:
        raise ValueError('Bad IEER string')
    return stack[0]


def ieerstr2tree(
    s,
    chunk_types=[
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
    ],
    root_label="S",
):
    """
    Return a chunk structure containing the chunked tagged text that is
    encoded in the given IEER style string.
    Convert a string of chunked tagged text in the IEER named
    entity format into a chunk structure.  Chunks are of several
    types, LOCATION, ORGANIZATION, PERSON, DURATION, DATE, CARDINAL,
    PERCENT, MONEY, and MEASURE.

    :rtype: Tree
    """

    # Try looking for a single document.  If that doesn't work, then just
    # treat everything as if it was within the <TEXT>...</TEXT>.
    m = _IEER_DOC_RE.match(s)
    if m:
        return {
            'text': _ieer_read_text(m.group('text'), root_label),
            'docno': m.group('docno'),
            'doctype': m.group('doctype'),
            'date_time': m.group('date_time'),
            # 'headline': m.group('headline')
            # we want to capture NEs in the headline too!
            'headline': _ieer_read_text(m.group('headline'), root_label),
        }
    else:
        return _ieer_read_text(s, root_label)
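

# An illustrative sketch of ieerstr2tree() on a tiny, invented IEER-style
# fragment; the tag names below are assumed examples that satisfy the parser's
# regexes.  A full document wrapped in <DOC>/<BODY>/<TEXT> markup would instead
# come back as a dict of fields, as the code above shows.
def _example_ieerstr2tree():
    s = ('<b_enamex type="PERSON">Pierre Vinken<e_enamex> will join '
         '<b_enamex type="ORGANIZATION">Elsevier<e_enamex> .')
    tree = ieerstr2tree(s)
    assert tree == Tree('S', [Tree('PERSON', ['Pierre', 'Vinken']),
                              'will', 'join',
                              Tree('ORGANIZATION', ['Elsevier']), '.'])
    return tree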


def demo():
    s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./."
    import nltk

    t = nltk.chunk.tagstr2tree(s, chunk_label='NP')
    t.pprint()
    print()

    s = """
These DT B-NP
research NN I-NP
protocols NNS I-NP
offer VBP B-VP
to TO B-PP
the DT B-NP
patient NN I-NP
not RB O
only RB O
the DT B-NP
very RB I-NP
best JJS I-NP
therapy NN I-NP
which WDT B-NP
we PRP B-NP
have VBP B-VP
established VBN I-VP
today NN B-NP
but CC B-NP
also RB I-NP
the DT B-NP
hope NN I-NP
of IN B-PP
something NN B-NP
still RB B-ADJP
better JJR I-ADJP
. . O
"""
    conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP'))
    conll_tree.pprint()

    # Demonstrate CoNLL output
    print("CoNLL output:")
    print(nltk.chunk.tree2conllstr(conll_tree))
    print()


if __name__ == '__main__':
    demo()