@@ -1,12 +1,16 @@
import nltk
nltk.download('punkt')
from tika import parser
-
+einleitung = False  # flag: becomes True once a sentence containing "Einleitung" has been seen
raw = parser.from_file('iso_documents/ISO1101.PDF')
#print(raw['content'])
-text = raw
+text = raw['content']  # use the 'content' entry of the parser result; NOTE(review): presumably None when no text is extracted - confirm, sent_tokenize would then fail
sent_text = nltk.sent_tokenize(text)
#tokenized_text = nltk.word_tokenize(sent_text.split)
#tagged = nltk.pos_tag(tokenized_text)
#match = text.concordance('Toleranz')
-#print(sent_text)
+for sentence in sent_text:  # own loop name, so `text` keeps referring to the full document
+    if einleitung and "Toleranz" in sentence:  # print "Toleranz" sentences only after the "Einleitung" marker was seen
+        print(sentence)
+    if "Einleitung" in sentence:  # checked after the print, so the first marker sentence itself is never printed
+        einleitung = True
@@ -0,0 +1,12 @@
+import tabula
+
+#tables = tabula.read_pdf("iso_documents/ISO1101.PDF", multiple_tables=True)
+#for table in tables:
+# print(table)
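+# NOTE: pdftotext -layout!!!! (presumably a reminder to try layout-preserving text extraction with pdftotext instead - confirm)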
+tabula.convert_into("iso_documents/ISO1101.PDF", "output.csv", output_format="csv", pages='all', multiple_tables=True)  # write the tables from all pages to output.csv
+df = tabula.read_pdf("iso_documents/ISO1101.PDF", pages='all', multiple_tables=True)  # second pass over the same PDF; NOTE(review): with multiple_tables=True this is presumably a list of DataFrames, not a single one - confirm
+print(df)