repp.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Interface to the Repp Tokenizer
  3. #
  4. # Copyright (C) 2001-2015 NLTK Project
  5. # Authors: Rebecca Dridan and Stephan Oepen
  6. # Contributors: Liling Tan
  7. #
  8. # URL: <http://nltk.org/>
  9. # For license information, see LICENSE.TXT
  10. from __future__ import unicode_literals, print_function
  11. import os
  12. import re
  13. import sys
  14. import subprocess
  15. import tempfile
  16. from six import text_type
  17. from nltk.data import ZipFilePathPointer
  18. from nltk.internals import find_dir
  19. from nltk.tokenize.api import TokenizerI
  20. class ReppTokenizer(TokenizerI):
  21. """
  22. A class for word tokenization using the REPP parser described in
  23. Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
  24. Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
  25. and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
  26. >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
  27. ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
  28. ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
  29. ... ]
  30. >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
  31. >>> for sent in sents: # doctest: +SKIP
  32. ... tokenizer.tokenize(sent) # doctest: +SKIP
  33. ...
  34. (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
  35. (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
  36. (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
  37. >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
  38. ... print sent # doctest: +SKIP
  39. ...
  40. (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
  41. (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
  42. (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
  43. >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
  44. ... print sent # doctest: +SKIP
  45. ...
  46. [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
  47. [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
  48. [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
  49. """
  50. def __init__(self, repp_dir, encoding='utf8'):
  51. self.repp_dir = self.find_repptokenizer(repp_dir)
  52. # Set a directory to store the temporary files.
  53. self.working_dir = tempfile.gettempdir()
  54. # Set an encoding for the input strings.
  55. self.encoding = encoding
  56. def tokenize(self, sentence):
  57. """
  58. Use Repp to tokenize a single sentence.
  59. :param sentence: A single sentence string.
  60. :type sentence: str
  61. :return: A tuple of tokens.
  62. :rtype: tuple(str)
  63. """
  64. return next(self.tokenize_sents([sentence]))
  65. def tokenize_sents(self, sentences, keep_token_positions=False):
  66. """
  67. Tokenize multiple sentences using Repp.
  68. :param sentences: A list of sentence strings.
  69. :type sentences: list(str)
  70. :return: A list of tuples of tokens
  71. :rtype: iter(tuple(str))
  72. """
  73. with tempfile.NamedTemporaryFile(
  74. prefix='repp_input.', dir=self.working_dir, mode='w', delete=False
  75. ) as input_file:
  76. # Write sentences to temporary input file.
  77. for sent in sentences:
  78. input_file.write(text_type(sent) + '\n')
  79. input_file.close()
  80. # Generate command to run REPP.
  81. cmd = self.generate_repp_command(input_file.name)
  82. # Decode the stdout and strips the ending newline.
  83. repp_output = self._execute(cmd).decode(self.encoding).strip()
  84. for tokenized_sent in self.parse_repp_outputs(repp_output):
  85. if not keep_token_positions:
  86. # Removes token position information.
  87. tokenized_sent, starts, ends = zip(*tokenized_sent)
  88. yield tokenized_sent
  89. def generate_repp_command(self, inputfilename):
  90. """
  91. This module generates the REPP command to be used at the terminal.
  92. :param inputfilename: path to the input file
  93. :type inputfilename: str
  94. """
  95. cmd = [self.repp_dir + '/src/repp']
  96. cmd += ['-c', self.repp_dir + '/erg/repp.set']
  97. cmd += ['--format', 'triple']
  98. cmd += [inputfilename]
  99. return cmd
  100. @staticmethod
  101. def _execute(cmd):
  102. p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  103. stdout, stderr = p.communicate()
  104. return stdout
  105. @staticmethod
  106. def parse_repp_outputs(repp_output):
  107. """
  108. This module parses the tri-tuple format that REPP outputs using the
  109. "--format triple" option and returns an generator with tuple of string
  110. tokens.
  111. :param repp_output:
  112. :type repp_output: type
  113. :return: an iterable of the tokenized sentences as tuples of strings
  114. :rtype: iter(tuple)
  115. """
  116. line_regex = re.compile('^\((\d+), (\d+), (.+)\)$', re.MULTILINE)
  117. for section in repp_output.split('\n\n'):
  118. words_with_positions = [
  119. (token, int(start), int(end))
  120. for start, end, token in line_regex.findall(section)
  121. ]
  122. words = tuple(t[2] for t in words_with_positions)
  123. yield words_with_positions
  124. def find_repptokenizer(self, repp_dirname):
  125. """
  126. A module to find REPP tokenizer binary and its *repp.set* config file.
  127. """
  128. if os.path.exists(repp_dirname): # If a full path is given.
  129. _repp_dir = repp_dirname
  130. else: # Try to find path to REPP directory in environment variables.
  131. _repp_dir = find_dir(repp_dirname, env_vars=('REPP_TOKENIZER',))
  132. # Checks for the REPP binary and erg/repp.set config file.
  133. assert os.path.exists(_repp_dir + '/src/repp')
  134. assert os.path.exists(_repp_dir + '/erg/repp.set')
  135. return _repp_dir