bllip.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. # Natural Language Toolkit: Interface to BLLIP Parser
  2. #
  3. # Author: David McClosky <dmcc@bigasterisk.com>
  4. #
  5. # Copyright (C) 2001-2019 NLTK Project
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. from __future__ import print_function
  9. from nltk.parse.api import ParserI
  10. from nltk.tree import Tree
  11. """
  12. Interface for parsing with BLLIP Parser. Requires the Python
  13. bllipparser module. BllipParser objects can be constructed with the
  14. ``BllipParser.from_unified_model_dir`` class method or manually using the
  15. ``BllipParser`` constructor. The former is generally easier if you have
  16. a BLLIP Parser unified model directory -- a basic model can be obtained
  17. from NLTK's downloader. More unified parsing models can be obtained with
  18. BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher``
  19. or see docs for ``bllipparser.ModelFetcher.download_and_install_model``).
Basic usage::

    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux

    >>> from nltk.data import find
    >>> model_dir = find('models/bllip_wsj_no_aux').path
    >>> bllip = BllipParser.from_unified_model_dir(model_dir)

    # 1-best parsing
    >>> sentence1 = 'British left waffles on Falklands .'.split()
    >>> top_parse = bllip.parse_one(sentence1)
    >>> print(top_parse)
    (S1
      (S
        (NP (JJ British) (NN left))
        (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands))))
        (. .)))

    # n-best parsing
    >>> sentence2 = 'Time flies'.split()
    >>> all_parses = bllip.parse_all(sentence2)
    >>> print(len(all_parses))
    50
    >>> print(all_parses[0])
    (S1 (S (NP (NNP Time)) (VP (VBZ flies))))

    # incorporating external tagging constraints (None means unconstrained tag)
    >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')])
    >>> print(next(constrained1))
    (S1 (NP (VB Time) (NNS flies)))
    >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)])
    >>> print(next(constrained2))
    (S1 (NP (NN Time) (VBZ flies)))

  49. References
  50. ----------
  51. - Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of
  52. the 1st North American chapter of the Association for Computational
  53. Linguistics conference. Association for Computational Linguistics,
  54. 2000.
  55. - Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing
  56. and MaxEnt discriminative reranking." Proceedings of the 43rd Annual
  57. Meeting on Association for Computational Linguistics. Association
  58. for Computational Linguistics, 2005.
  59. Known issues
  60. ------------
  61. Note that BLLIP Parser is not currently threadsafe. Since this module
  62. uses a SWIG interface, it is potentially unsafe to create multiple
  63. ``BllipParser`` objects in the same process. BLLIP Parser currently
  64. has issues with non-ASCII text and will raise an error if given any.
  65. See http://pypi.python.org/pypi/bllipparser/ for more information
  66. on BLLIP Parser's Python interface.
  67. """
  68. __all__ = ['BllipParser']
  69. # this block allows this module to be imported even if bllipparser isn't
  70. # available
  71. try:
  72. from bllipparser import RerankingParser
  73. from bllipparser.RerankingParser import get_unified_model_parameters
  74. def _ensure_bllip_import_or_error():
  75. pass
  76. except ImportError as ie:
  77. def _ensure_bllip_import_or_error(ie=ie):
  78. raise ImportError("Couldn't import bllipparser module: %s" % ie)
  79. def _ensure_ascii(words):
  80. try:
  81. for i, word in enumerate(words):
  82. word.decode('ascii')
  83. except UnicodeDecodeError:
  84. raise ValueError(
  85. "Token %d (%r) is non-ASCII. BLLIP Parser "
  86. "currently doesn't support non-ASCII inputs." % (i, word)
  87. )
  88. def _scored_parse_to_nltk_tree(scored_parse):
  89. return Tree.fromstring(str(scored_parse.ptb_parse))
  90. class BllipParser(ParserI):
  91. """
  92. Interface for parsing with BLLIP Parser. BllipParser objects can be
  93. constructed with the ``BllipParser.from_unified_model_dir`` class
  94. method or manually using the ``BllipParser`` constructor.
  95. """
  96. def __init__(
  97. self,
  98. parser_model=None,
  99. reranker_features=None,
  100. reranker_weights=None,
  101. parser_options=None,
  102. reranker_options=None,
  103. ):
  104. """
  105. Load a BLLIP Parser model from scratch. You'll typically want to
  106. use the ``from_unified_model_dir()`` class method to construct
  107. this object.
  108. :param parser_model: Path to parser model directory
  109. :type parser_model: str
  110. :param reranker_features: Path the reranker model's features file
  111. :type reranker_features: str
  112. :param reranker_weights: Path the reranker model's weights file
  113. :type reranker_weights: str
  114. :param parser_options: optional dictionary of parser options, see
  115. ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
  116. for more information.
  117. :type parser_options: dict(str)
  118. :param reranker_options: optional
  119. dictionary of reranker options, see
  120. ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
  121. for more information.
  122. :type reranker_options: dict(str)
  123. """
  124. _ensure_bllip_import_or_error()
  125. parser_options = parser_options or {}
  126. reranker_options = reranker_options or {}
  127. self.rrp = RerankingParser()
  128. self.rrp.load_parser_model(parser_model, **parser_options)
  129. if reranker_features and reranker_weights:
  130. self.rrp.load_reranker_model(
  131. features_filename=reranker_features,
  132. weights_filename=reranker_weights,
  133. **reranker_options
  134. )
  135. def parse(self, sentence):
  136. """
  137. Use BLLIP Parser to parse a sentence. Takes a sentence as a list
  138. of words; it will be automatically tagged with this BLLIP Parser
  139. instance's tagger.
  140. :return: An iterator that generates parse trees for the sentence
  141. from most likely to least likely.
  142. :param sentence: The sentence to be parsed
  143. :type sentence: list(str)
  144. :rtype: iter(Tree)
  145. """
  146. _ensure_ascii(sentence)
  147. nbest_list = self.rrp.parse(sentence)
  148. for scored_parse in nbest_list:
  149. yield _scored_parse_to_nltk_tree(scored_parse)
  150. def tagged_parse(self, word_and_tag_pairs):
  151. """
  152. Use BLLIP to parse a sentence. Takes a sentence as a list of
  153. (word, tag) tuples; the sentence must have already been tokenized
  154. and tagged. BLLIP will attempt to use the tags provided but may
  155. use others if it can't come up with a complete parse subject
  156. to those constraints. You may also specify a tag as ``None``
  157. to leave a token's tag unconstrained.
  158. :return: An iterator that generates parse trees for the sentence
  159. from most likely to least likely.
  160. :param sentence: Input sentence to parse as (word, tag) pairs
  161. :type sentence: list(tuple(str, str))
  162. :rtype: iter(Tree)
  163. """
  164. words = []
  165. tag_map = {}
  166. for i, (word, tag) in enumerate(word_and_tag_pairs):
  167. words.append(word)
  168. if tag is not None:
  169. tag_map[i] = tag
  170. _ensure_ascii(words)
  171. nbest_list = self.rrp.parse_tagged(words, tag_map)
  172. for scored_parse in nbest_list:
  173. yield _scored_parse_to_nltk_tree(scored_parse)
  174. @classmethod
  175. def from_unified_model_dir(
  176. cls, model_dir, parser_options=None, reranker_options=None
  177. ):
  178. """
  179. Create a ``BllipParser`` object from a unified parsing model
  180. directory. Unified parsing model directories are a standardized
  181. way of storing BLLIP parser and reranker models together on disk.
  182. See ``bllipparser.RerankingParser.get_unified_model_parameters()``
  183. for more information about unified model directories.
  184. :return: A ``BllipParser`` object using the parser and reranker
  185. models in the model directory.
  186. :param model_dir: Path to the unified model directory.
  187. :type model_dir: str
  188. :param parser_options: optional dictionary of parser options, see
  189. ``bllipparser.RerankingParser.RerankingParser.load_parser_options()``
  190. for more information.
  191. :type parser_options: dict(str)
  192. :param reranker_options: optional dictionary of reranker options, see
  193. ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()``
  194. for more information.
  195. :type reranker_options: dict(str)
  196. :rtype: BllipParser
  197. """
  198. (
  199. parser_model_dir,
  200. reranker_features_filename,
  201. reranker_weights_filename,
  202. ) = get_unified_model_parameters(model_dir)
  203. return cls(
  204. parser_model_dir,
  205. reranker_features_filename,
  206. reranker_weights_filename,
  207. parser_options,
  208. reranker_options,
  209. )
  210. def demo():
  211. """This assumes the Python module bllipparser is installed."""
  212. # download and install a basic unified parsing model (Wall Street Journal)
  213. # sudo python -m nltk.downloader bllip_wsj_no_aux
  214. from nltk.data import find
  215. model_dir = find('models/bllip_wsj_no_aux').path
  216. print('Loading BLLIP Parsing models...')
  217. # the easiest way to get started is to use a unified model
  218. bllip = BllipParser.from_unified_model_dir(model_dir)
  219. print('Done.')
  220. sentence1 = 'British left waffles on Falklands .'.split()
  221. sentence2 = 'I saw the man with the telescope .'.split()
  222. # this sentence is known to fail under the WSJ parsing model
  223. fail1 = '# ! ? : -'.split()
  224. for sentence in (sentence1, sentence2, fail1):
  225. print('Sentence: %r' % ' '.join(sentence))
  226. try:
  227. tree = next(bllip.parse(sentence))
  228. print(tree)
  229. except StopIteration:
  230. print("(parse failed)")
  231. # n-best parsing demo
  232. for i, parse in enumerate(bllip.parse(sentence1)):
  233. print('parse %d:\n%s' % (i, parse))
  234. # using external POS tag constraints
  235. print(
  236. "forcing 'tree' to be 'NN':",
  237. next(bllip.tagged_parse([('A', None), ('tree', 'NN')])),
  238. )
  239. print(
  240. "forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
  241. next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])),
  242. )
  243. # constraints don't have to make sense... (though on more complicated
  244. # sentences, they may cause the parse to fail)
  245. print(
  246. "forcing 'A' to be 'NNP':",
  247. next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])),
  248. )
  249. def setup_module(module):
  250. from nose import SkipTest
  251. try:
  252. _ensure_bllip_import_or_error()
  253. except ImportError:
  254. raise SkipTest(
  255. 'doctests from nltk.parse.bllip are skipped because '
  256. 'the bllipparser module is not installed'
  257. )