@@ -1,12 +1,16 @@
 import nltk
 nltk.download('punkt')
 from tika import parser
-
+einleitung = False
 raw = parser.from_file('iso_documents/ISO1101.PDF')
 #print(raw['content'])
-text = raw
+text = raw['content']
 sent_text = nltk.sent_tokenize(text)
 #tokenized_text = nltk.word_tokenize(sent_text.split)
 #tagged = nltk.pos_tag(tokenized_text)
 #match = text.concordance('Toleranz')
-#print(sent_text)
+for text in sent_text:
+    if "Toleranz" in text and einleitung is True:
+        print(text)
+    if "Einleitung" in text:
+        einleitung = True
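
For reference, a sketch of the script as it reads after this hunk. The four-space indentation of the new loop is assumed (the rendered diff does not preserve it), and the inline comments and blank lines are illustrative additions, not part of the committed file:

import nltk

nltk.download('punkt')

from tika import parser

# Flag: only report matches once the introduction ("Einleitung") has been reached.
einleitung = False

# Extract the text of the ISO 1101 PDF via Apache Tika.
raw = parser.from_file('iso_documents/ISO1101.PDF')
#print(raw['content'])
text = raw['content']

# Split the extracted text into sentences.
sent_text = nltk.sent_tokenize(text)
#tokenized_text = nltk.word_tokenize(sent_text.split)
#tagged = nltk.pos_tag(tokenized_text)
#match = text.concordance('Toleranz')

for text in sent_text:
    # Print sentences mentioning "Toleranz", but only after "Einleitung" was seen.
    if "Toleranz" in text and einleitung is True:
        print(text)
    if "Einleitung" in text:
        einleitung = True

Note that the flag check could equally be written as `if "Toleranz" in text and einleitung:`; the `is True` form keeps the diff as committed.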