Browse Source

started iso reader

bscheibel 5 years ago
parent
commit
36792a2cc6
2 changed files with 12 additions and 0 deletions
  1. 0 0
      iso_documents/ISO1101.PDF
  2. 12 0
      read_isos.py

iso_documents/ISO 1101.PDF → iso_documents/ISO1101.PDF


+ 12 - 0
read_isos.py

@@ -0,0 +1,12 @@
+import nltk
+nltk.download('punkt')
+from tika import parser
+
+raw = parser.from_file('iso_documents/ISO1101.PDF')
+#print(raw['content'])
+text = raw
+sent_text = nltk.sent_tokenize(text)
+#tokenized_text = nltk.word_tokenize(sent_text.split)
+#tagged = nltk.pos_tag(tokenized_text)
+#match = text.concordance('Toleranz')
+#print(sent_text)