preprocessing.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. # -*- coding: utf-8 -*-
  2. # Natural Language Toolkit: Language Model Preprocessing
  3. #
  4. # Copyright (C) 2001-2019 NLTK Project
  5. # Author: Ilia Kurenkov <ilia.kurenkov@gmail.com>
  6. # URL: <http://nltk.org/>
  7. # For license information, see LICENSE.TXT
from functools import partial
from itertools import chain, tee

from nltk.util import everygrams, pad_sequence
  11. flatten = chain.from_iterable
  12. pad_both_ends = partial(
  13. pad_sequence,
  14. pad_left=True,
  15. left_pad_symbol="<s>",
  16. pad_right=True,
  17. right_pad_symbol="</s>",
  18. )
  19. pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.
  20. Following convention <s> pads the start of sentence </s> pads its end.
  21. """
  22. def padded_everygrams(order, sentence):
  23. """Helper with some useful defaults.
  24. Applies pad_both_ends to sentence and follows it up with everygrams.
  25. """
  26. return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)
  27. def padded_everygram_pipeline(order, text):
  28. """Default preprocessing for a sequence of sentences.
  29. Creates two iterators:
  30. - sentences padded and turned into sequences of `nltk.util.everygrams`
  31. - sentences padded as above and chained together for a flat stream of words
  32. :param order: Largest ngram length produced by `everygrams`.
  33. :param text: Text to iterate over. Expected to be an iterable of sentences:
  34. Iterable[Iterable[str]]
  35. :return: iterator over text as ngrams, iterator over text as vocabulary data
  36. """
  37. padding_fn = partial(pad_both_ends, n=order)
  38. return (
  39. (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
  40. flatten(map(padding_fn, text)),
  41. )