# Natural Language Toolkit: Parser Utility Functions
#
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
#
# Copyright (C) 2001-2019 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Utility functions for parsers.
"""

from __future__ import print_function

from nltk.grammar import CFG, FeatureGrammar, PCFG
from nltk.data import load
from nltk.parse.chart import Chart, ChartParser
from nltk.parse.pchart import InsideChartParser
from nltk.parse.featurechart import FeatureChart, FeatureChartParser


def load_parser(
    grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args
):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.  If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart;
        should be ``Chart`` or a subclass.
        Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, or a subclass thereof.")
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)
    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)
    else:  # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)
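

# A minimal usage sketch for ``load_parser`` (not part of the module API).
# The grammar resource is an assumption: any CFG, PCFG or FCFG reachable
# through ``nltk.data`` will do; ``feat0.fcfg`` is one of the book grammars
# distributed with the NLTK data package.
def _demo_load_parser():
    parser = load_parser('grammars/book_grammars/feat0.fcfg', trace=0)
    # The sentence is illustrative; it is assumed to be covered by the
    # grammar's lexicon.
    for tree in parser.parse('Kim likes children'.split()):
        print(tree)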


def taggedsent_to_conll(sentence):
    """
    Convert a single POS tagged sentence into CONLL format.

    >>> from nltk import word_tokenize, pos_tag
    >>> text = "This is a foobar sentence."
    >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))):
    ...     print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _

    :param sentence: A single input sentence to parse
    :type sentence: list(tuple(str, str))
    :rtype: iter(str)
    :return: a generator yielding a single sentence in CONLL format.
    """
    for (i, (word, tag)) in enumerate(sentence, start=1):
        input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_']
        input_str = "\t".join(input_str) + "\n"
        yield input_str


def taggedsents_to_conll(sentences):
    """
    Convert a POS tagged document stream (i.e. a list of sentences, each a
    list of ``(word, tag)`` tuples) into CONLL format, yielding one line per
    word and two newlines at the end of each sentence.

    >>> from nltk import word_tokenize, sent_tokenize, pos_tag
    >>> text = "This is a foobar sentence. Is that right?"
    >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)]
    >>> for line in taggedsents_to_conll(sentences):
    ...     if line:
    ...         print(line, end="")
    1 This _ DT DT _ 0 a _ _
    2 is _ VBZ VBZ _ 0 a _ _
    3 a _ DT DT _ 0 a _ _
    4 foobar _ JJ JJ _ 0 a _ _
    5 sentence _ NN NN _ 0 a _ _
    6 . _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>
    1 Is _ VBZ VBZ _ 0 a _ _
    2 that _ IN IN _ 0 a _ _
    3 right _ NN NN _ 0 a _ _
    4 ? _ . . _ 0 a _ _
    <BLANKLINE>
    <BLANKLINE>

    :param sentences: Input sentences to parse
    :type sentences: list(list(tuple(str, str)))
    :rtype: iter(str)
    :return: a generator yielding sentences in CONLL format.
    """
    for sentence in sentences:
        for input_str in taggedsent_to_conll(sentence):
            yield input_str
        yield '\n\n'
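

# A minimal usage sketch (illustrative only): streaming the CONLL lines for a
# small, hypothetical tagged corpus into a file.  Any list of ``(word, tag)``
# sentences works; the file name is an assumption.
def _demo_conll_output():
    tagged = [[("This", "DT"), ("works", "VBZ"), (".", ".")]]
    with open("corpus.conll", "w") as fileobj:
        for line in taggedsents_to_conll(tagged):
            fileobj.write(line)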


######################################################################
# { Test Suites
######################################################################


class TestGrammar(object):
    """
    Unit tests for CFG.
    """

    def __init__(self, grammar, suite, accept=None, reject=None):
        self.test_grammar = grammar
        self.cp = load_parser(grammar, trace=0)
        self.suite = suite
        self._accept = accept
        self._reject = reject

    def run(self, show_trees=False):
        """
        Sentences in the test suite are divided into two classes:

        - grammatical (``accept``) and
        - ungrammatical (``reject``).

        If a sentence should parse according to the grammar, the value of
        ``trees`` will be a non-empty list. If a sentence should be rejected
        according to the grammar, then the value of ``trees`` will be None.
        """
        for test in self.suite:
            print(test['doc'] + ":", end=' ')
            accepted = rejected = False
            for key in ['accept', 'reject']:
                for sent in test[key]:
                    tokens = sent.split()
                    trees = list(self.cp.parse(tokens))
                    if show_trees and trees:
                        print()
                        print(sent)
                        for tree in trees:
                            print(tree)
                    if key == 'accept':
                        if trees == []:
                            raise ValueError("Sentence '%s' failed to parse" % sent)
                        else:
                            accepted = True
                    else:
                        if trees:
                            raise ValueError("Sentence '%s' received a parse" % sent)
                        else:
                            rejected = True
            if accepted and rejected:
                print("All tests passed!")
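

# A minimal sketch of ``TestGrammar`` usage (illustrative only).  The grammar
# URL and the suite entries are assumptions: each suite entry maps 'doc' to a
# short description and 'accept'/'reject' to lists of sentences that the
# grammar is expected to parse or reject.
def _demo_grammar_suite():
    suite = [
        {
            'doc': 'number agreement',
            'accept': ['Kim likes children'],
            'reject': ['Kim like children'],
        }
    ]
    tester = TestGrammar('grammars/book_grammars/feat0.fcfg', suite)
    tester.run()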


def extract_test_sentences(string, comment_chars="#%;", encoding=None):
    """
    Parses a string with one test sentence per line.
    Lines can optionally begin with:

    - a bool, saying if the sentence is grammatical or not, or
    - an int, giving the number of parse trees it should have.

    The result information is followed by a colon, and then the sentence.
    Empty lines and lines beginning with a comment char are ignored.

    :return: a list of tuples of sentences and expected results,
        where a sentence is a list of str,
        and a result is None, or bool, or int
    :param comment_chars: ``str`` of possible comment characters.
    :param encoding: the encoding of the string, if it is binary
    """
    if encoding is not None:
        string = string.decode(encoding)
    sentences = []
    for sentence in string.split('\n'):
        if sentence == '' or sentence[0] in comment_chars:
            continue
        split_info = sentence.split(':', 1)
        result = None
        if len(split_info) == 2:
            if split_info[0] in ['True', 'true', 'False', 'false']:
                result = split_info[0] in ['True', 'true']
                sentence = split_info[1]
            else:
                result = int(split_info[0])
                sentence = split_info[1]
        tokens = sentence.split()
        if tokens == []:
            continue
        sentences += [(tokens, result)]
    return sentences
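

# A minimal sketch of the input format ``extract_test_sentences`` expects
# (illustrative only): a line may start with a boolean or an integer result
# followed by a colon; comment lines and blank lines are skipped.
def _demo_sentence_extraction():
    demo_string = (
        "# a comment line\n"
        "True: the dog barks\n"
        "2: the old man the boats\n"
        "the cat sleeps\n"
    )
    # Expected value:
    #   [(['the', 'dog', 'barks'], True),
    #    (['the', 'old', 'man', 'the', 'boats'], 2),
    #    (['the', 'cat', 'sleeps'], None)]
    return extract_test_sentences(demo_string)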


# nose thinks it is a test
extract_test_sentences.__test__ = False