Bladeren bron

print plus minus

beatescheibel 5 jaar geleden
bovenliggende
commit
4b617df867
2 gewijzigde bestanden met 19 toevoegingen en 3 verwijderingen
  1. 7 3
      read_isos.py
  2. 12 0
      read_tables.py

+ 7 - 3
read_isos.py

@@ -1,12 +1,16 @@
 import nltk
 nltk.download('punkt')
 from tika import parser
-
+einleitung = False
 raw = parser.from_file('iso_documents/ISO1101.PDF')
 #print(raw['content'])
-text = raw
+text = raw['content']
 sent_text = nltk.sent_tokenize(text)
 #tokenized_text = nltk.word_tokenize(sent_text.split)
 #tagged = nltk.pos_tag(tokenized_text)
 #match = text.concordance('Toleranz')
-#print(sent_text)
+for text in sent_text:
+    if "Toleranz" in text and einleitung is True:
+        print(text)
+    if "Einleitung" in text:
+        einleitung = True

+ 12 - 0
read_tables.py

@@ -0,0 +1,12 @@
+import tabula
+
+
+#tables = tabula.read_pdf("iso_documents/ISO1101.PDF", multiple_tables=True)
+#for table in tables:
+#    print(table)
+
+#pdftotext - layout!!!!
+
+tabula.convert_into("iso_documents/ISO1101.PDF", "output.csv", output_format="csv", pages='all', multiple_tables=True)
+df = tabula.read_pdf("iso_documents/ISO1101.PDF", pages='all', multiple_tables=True)
+print(df)