# read_isos.py

  1. import nltk
  2. nltk.download('punkt')
  3. from tika import parser
  4. einleitung = False
  5. raw = parser.from_file('iso_documents/ISO1101.PDF')
  6. #print(raw['content'])
  7. text = raw['content']
  8. sent_text = nltk.sent_tokenize(text)
  9. #tokenized_text = nltk.word_tokenize(sent_text.split)
  10. #tagged = nltk.pos_tag(tokenized_text)
  11. #match = text.concordance('Toleranz')
  12. for text in sent_text:
  13. if "Toleranz" in text and einleitung is True:
  14. print(text)
  15. if "Einleitung" in text:
  16. einleitung = True