bscheibel
/
technical_drawings_extraction


			
				
					
						
						
							12345678910111213141516
							import nltk
nltk.download('punkt')
from tika import parser
# Print every sentence of the ISO 1101 PDF that mentions "Toleranz", looking
# only at sentences that come after the first one containing "Einleitung".

# Flipped to True once a sentence containing "Einleitung" has been seen.
einleitung = False

raw = parser.from_file('iso_documents/ISO1101.PDF')
# Tika may report the content as None when no text could be extracted from
# the document; fall back to an empty string so that tokenizing does not
# fail with a TypeError.
text = raw.get('content') or ''

# The search terms below are German, so split sentences with the German
# Punkt model instead of the English default.
sent_text = nltk.sent_tokenize(text, language='german')

for sentence in sent_text:
    # The keyword test runs before the marker test on purpose: the sentence
    # that contains "Einleitung" itself is only printed if the marker had
    # already been seen in an earlier sentence.
    if einleitung and "Toleranz" in sentence:
        print(sentence)
    if "Einleitung" in sentence:
        einleitung = True