"""read_isos.py: extract the text of an ISO standard PDF and split it into sentences.

The PDF is parsed with Apache Tika (via the ``tika`` package) and the
extracted text is split into sentences with NLTK.
"""
import nltk
from tika import parser

# Fetch the Punkt resource before calling nltk.sent_tokenize below.
nltk.download('punkt')

# parser.from_file returns a dict, not a string; the extracted document text
# is stored under the 'content' key.
raw = parser.from_file('iso_documents/ISO1101.PDF')
#print(raw['content'])

# Pass only the extracted text to the tokenizer. Handing over the whole
# `raw` dict makes sent_tokenize fail because it expects a string.
# 'content' may be missing or None when no text could be extracted, so
# fall back to an empty string (which yields an empty sentence list).
text = raw.get('content') or ''

# NOTE(review): sent_tokenize defaults to English; the search term 'Toleranz'
# below suggests German documents -- consider language='german'; confirm.
sent_text = nltk.sent_tokenize(text)

# Further processing steps sketched by the original author, currently disabled:
#tokenized_text = nltk.word_tokenize(sent_text.split)
#tagged = nltk.pos_tag(tokenized_text)
#match = text.concordance('Toleranz')
#print(sent_text)