senna.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. # encoding: utf-8
  2. # Natural Language Toolkit: Senna Interface
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
  8. """
  9. A general interface to the SENNA pipeline that supports any of the
  10. operations specified in SUPPORTED_OPERATIONS.
  11. Applying multiple operations at once is faster than applying them one by one. For example,
  12. Senna will automatically determine POS tags if you are extracting named
  13. entities. Applying both of the operations will cost only the time of
  14. extracting the named entities.
  15. The SENNA pipeline has a fixed maximum size for the sentences that it can read.
  16. By default it is 1024 tokens per sentence. If you have longer sentences, changing
  17. the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your
  18. system specific binary should be rebuilt. Otherwise this could introduce
  19. misalignment errors.
  20. The input is:
  21. - path to the directory that contains the SENNA executables. If no executable is found there,
  22. Senna will automatically look in the directory given by the SENNA environment variable.
  23. - List of the operations needed to be performed.
  24. - (optionally) the encoding of the input data (default:utf-8)
  25. Note: Unit tests for this module can be found in test/unit/test_senna.py
  26. >>> from __future__ import unicode_literals
  27. >>> from nltk.classify import Senna
  28. >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
  29. >>> sent = 'Dusseldorf is an international business center'.split()
  30. >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP
  31. [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
  32. ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
  33. """
  34. from __future__ import unicode_literals
  35. from os import path, sep, environ
  36. from subprocess import Popen, PIPE
  37. from platform import architecture, system
  38. from six import text_type
  39. from nltk.tag.api import TaggerI
  40. from nltk.compat import python_2_unicode_compatible
  41. _senna_url = 'http://ml.nec-labs.com/senna/'
  42. @python_2_unicode_compatible
  43. class Senna(TaggerI):
  44. SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner']
  45. def __init__(self, senna_path, operations, encoding='utf-8'):
  46. self._encoding = encoding
  47. self._path = path.normpath(senna_path) + sep
  48. # Verifies the existence of the executable on the self._path first
  49. # senna_binary_file_1 = self.executable(self._path)
  50. exe_file_1 = self.executable(self._path)
  51. if not path.isfile(exe_file_1):
  52. # Check for the system environment
  53. if 'SENNA' in environ:
  54. # self._path = path.join(environ['SENNA'],'')
  55. self._path = path.normpath(environ['SENNA']) + sep
  56. exe_file_2 = self.executable(self._path)
  57. if not path.isfile(exe_file_2):
  58. raise OSError(
  59. "Senna executable expected at %s or %s but not found"
  60. % (exe_file_1, exe_file_2)
  61. )
  62. self.operations = operations
  63. def executable(self, base_path):
  64. """
  65. The function that determines the system specific binary that should be
  66. used in the pipeline. In case, the system is not known the default senna binary will
  67. be used.
  68. """
  69. os_name = system()
  70. if os_name == 'Linux':
  71. bits = architecture()[0]
  72. if bits == '64bit':
  73. return path.join(base_path, 'senna-linux64')
  74. return path.join(base_path, 'senna-linux32')
  75. if os_name == 'Windows':
  76. return path.join(base_path, 'senna-win32.exe')
  77. if os_name == 'Darwin':
  78. return path.join(base_path, 'senna-osx')
  79. return path.join(base_path, 'senna')
  80. def _map(self):
  81. """
  82. A method that calculates the order of the columns that SENNA pipeline
  83. will output the tags into. This depends on the operations being ordered.
  84. """
  85. _map = {}
  86. i = 1
  87. for operation in Senna.SUPPORTED_OPERATIONS:
  88. if operation in self.operations:
  89. _map[operation] = i
  90. i += 1
  91. return _map
  92. def tag(self, tokens):
  93. """
  94. Applies the specified operation(s) on a list of tokens.
  95. """
  96. return self.tag_sents([tokens])[0]
  97. def tag_sents(self, sentences):
  98. """
  99. Applies the tag method over a list of sentences. This method will return a
  100. list of dictionaries. Every dictionary will contain a word with its
  101. calculated annotations/tags.
  102. """
  103. encoding = self._encoding
  104. if not path.isfile(self.executable(self._path)):
  105. raise OSError(
  106. "Senna executable expected at %s but not found"
  107. % self.executable(self._path)
  108. )
  109. # Build the senna command to run the tagger
  110. _senna_cmd = [
  111. self.executable(self._path),
  112. '-path',
  113. self._path,
  114. '-usrtokens',
  115. '-iobtags',
  116. ]
  117. _senna_cmd.extend(['-' + op for op in self.operations])
  118. # Serialize the actual sentences to a temporary string
  119. _input = '\n'.join((' '.join(x) for x in sentences)) + '\n'
  120. if isinstance(_input, text_type) and encoding:
  121. _input = _input.encode(encoding)
  122. # Run the tagger and get the output
  123. p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
  124. (stdout, stderr) = p.communicate(input=_input)
  125. senna_output = stdout
  126. # Check the return code.
  127. if p.returncode != 0:
  128. raise RuntimeError('Senna command failed! Details: %s' % stderr)
  129. if encoding:
  130. senna_output = stdout.decode(encoding)
  131. # Output the tagged sentences
  132. map_ = self._map()
  133. tagged_sentences = [[]]
  134. sentence_index = 0
  135. token_index = 0
  136. for tagged_word in senna_output.strip().split("\n"):
  137. if not tagged_word:
  138. tagged_sentences.append([])
  139. sentence_index += 1
  140. token_index = 0
  141. continue
  142. tags = tagged_word.split('\t')
  143. result = {}
  144. for tag in map_:
  145. result[tag] = tags[map_[tag]].strip()
  146. try:
  147. result['word'] = sentences[sentence_index][token_index]
  148. except IndexError:
  149. raise IndexError(
  150. "Misalignment error occurred at sentence number %d. Possible reason"
  151. " is that the sentence size exceeded the maximum size. Check the "
  152. "documentation of Senna class for more information."
  153. % sentence_index
  154. )
  155. tagged_sentences[-1].append(result)
  156. token_index += 1
  157. return tagged_sentences
  158. # skip doctests if Senna is not installed
  159. def setup_module(module):
  160. from nose import SkipTest
  161. try:
  162. tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
  163. except OSError:
  164. raise SkipTest("Senna executable not found")