- import nltk
- nltk.download('punkt')
- from tika import parser
- raw = parser.from_file('iso_documents/ISO1101.PDF')
- #print(raw['content'])
- text = raw
- sent_text = nltk.sent_tokenize(text)
- #tokenized_text = nltk.word_tokenize(sent_text.split)
- #tagged = nltk.pos_tag(tokenized_text)
- #match = text.concordance('Toleranz')
- #print(sent_text)
|