@@ -1,12 +1,16 @@
 import nltk
 nltk.download('punkt')
 from tika import parser
-
+einleitung = False
 raw = parser.from_file('iso_documents/ISO1101.PDF')
 #print(raw['content'])
-text = raw
+text = raw['content']
 sent_text = nltk.sent_tokenize(text)
 #tokenized_text = nltk.word_tokenize(sent_text.split)
 #tagged = nltk.pos_tag(tokenized_text)
 #match = text.concordance('Toleranz')
-#print(sent_text)
+for text in sent_text:
+    if "Toleranz" in text and einleitung is True:
+        print(text)
+    if "Einleitung" in text:
+        einleitung = True
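
For reference, a sketch of the script as it reads after this hunk. The four-space indentation of the new loop is assumed (the rendered diff does not preserve it), and the inline comments and blank lines are illustrative additions, not part of the committed file:

import nltk

nltk.download('punkt')

from tika import parser

# Flag: only report matches once the introduction ("Einleitung") has been reached.
einleitung = False

# Extract the text of the ISO 1101 PDF via Apache Tika.
raw = parser.from_file('iso_documents/ISO1101.PDF')
#print(raw['content'])
text = raw['content']

# Split the extracted text into sentences.
sent_text = nltk.sent_tokenize(text)
#tokenized_text = nltk.word_tokenize(sent_text.split)
#tagged = nltk.pos_tag(tokenized_text)
#match = text.concordance('Toleranz')

for text in sent_text:
    # Print sentences mentioning "Toleranz", but only after "Einleitung" was seen.
    if "Toleranz" in text and einleitung is True:
        print(text)
    if "Einleitung" in text:
        einleitung = True

Note that the flag check could equally be written as `if "Toleranz" in text and einleitung:`; the `is True` form keeps the diff as committed.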