bnc.doctest 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. .. Copyright (C) 2001-2019 NLTK Project
  2. .. For license information, see LICENSE.TXT
  3. >>> import os.path
  4. >>> from nltk.corpus.reader import BNCCorpusReader
  5. >>> import nltk.test
  6. >>> root = os.path.dirname(nltk.test.__file__)
  7. >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')
  8. Checking word access.
  9. ---------------------
  10. >>> len(bnc.words())
  11. 151
  12. >>> bnc.words()[:6]
  13. ['Ah', 'there', 'we', 'are', ',', '.']
  14. >>> bnc.words(stem=True)[:6]
  15. ['ah', 'there', 'we', 'be', ',', '.']
  16. >>> bnc.tagged_words()[:6]
  17. [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
  18. >>> bnc.tagged_words(c5=True)[:6]
  19. [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
  20. Testing access to the sentences.
  21. --------------------------------
  22. >>> len(bnc.sents())
  23. 15
  24. >>> bnc.sents()[0]
  25. ['Ah', 'there', 'we', 'are', ',', '.']
  26. >>> bnc.sents(stem=True)[0]
  27. ['ah', 'there', 'we', 'be', ',', '.']
  28. >>> bnc.tagged_sents()[0]
  29. [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
  30. >>> bnc.tagged_sents(c5=True)[0]
  31. [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
  32. A non-lazy loader.
  33. ------------------
  34. >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)
  35. >>> len(eager.words())
  36. 151
  37. >>> eager.words(stem=True)[6:17]
  38. ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
  39. >>> eager.tagged_words()[6:11]
  40. [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')]
  41. >>> eager.tagged_words(c5=True)[6:17]
  42. [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')]
  43. >>> len(eager.sents())
  44. 15