123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- # Natural Language Toolkit: Some texts for exploration in chapter 1 of the book
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Steven Bird <stevenbird1@gmail.com>
- #
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- from __future__ import print_function
- from nltk.corpus import (
- gutenberg,
- genesis,
- inaugural,
- nps_chat,
- webtext,
- treebank,
- wordnet,
- )
- from nltk.text import Text
- from nltk.probability import FreqDist
- from nltk.util import bigrams
# Banner shown once, when the module is imported.
print(
    "*** Introductory Examples for the NLTK Book ***",
    "Loading text1, ..., text9 and sent1, ..., sent9",
    "Type the name of the text or sentence to view it.",
    "Type: 'texts()' or 'sents()' to list the materials.",
    sep="\n",
)

# Each text is announced right after it is built, so the user sees
# progress while the corpora are being read.

# text1 and text2 are built without an explicit name; whatever name
# Text assigns on its own is what gets printed.
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

# text3 through text8 are given an explicit display name.
text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

# text9, like text1 and text2, is built without an explicit name.
text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)
def texts():
    """Print the label and name of text1 through text9, one per line.

    Each line has the form ``textN: <name>``, matching what is printed
    when the texts are first loaded.
    """
    loaded = (text1, text2, text3, text4, text5, text6, text7, text8, text9)
    for number, text in enumerate(loaded, start=1):
        print("text{}:".format(number), text.name)
# Sample sentences sent1 ... sent9, stored as lists of string tokens.
# Each sentence is written with its tokens separated by single spaces
# (punctuation marks are tokens of their own), then split on whitespace
# to produce the token list.
sent1 = "Call me Ishmael .".split()
sent2 = "The family of Dashwood had long been settled in Sussex .".split()
sent3 = "In the beginning God created the heaven and the earth .".split()
sent4 = (
    "Fellow - Citizens of the Senate and of the House of Representatives :"
).split()
sent5 = "I have a problem with people PMing me to lol JOIN".split()
sent6 = (
    "SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !"
).split()
sent7 = (
    "Pierre Vinken , 61 years old , will join the board "
    "as a nonexecutive director Nov. 29 ."
).split()
sent8 = (
    "25 SEXY MALE , seeks attrac older single lady , "
    "for discreet encounters ."
).split()
sent9 = (
    "THE suburb of Saffron Park lay on the sunset side of London , "
    "as red and ragged as a cloud of sunset ."
).split()
def sents():
    """Print sent1 through sent9, one per line.

    Each line has the form ``sentN: <tokens joined by single spaces>``.
    """
    samples = (sent1, sent2, sent3, sent4, sent5, sent6, sent7, sent8, sent9)
    for number, tokens in enumerate(samples, start=1):
        print("sent{}:".format(number), " ".join(tokens))
|