Browse Source

print text

beatescheibel 5 years ago
parent
commit
3a5a13af26
6 changed files with 61 additions and 3 deletions
  1. 1 1
      .idea/dxf_reader.iml
  2. 1 1
      .idea/misc.xml
  3. BIN
      GV_12.PDF
  4. 1 1
      merge_pandas.py
  5. 42 0
      ocr_test.py
  6. 16 0
      read_pdf.py

+ 1 - 1
.idea/dxf_reader.iml

@@ -4,7 +4,7 @@
     <content url="file://$MODULE_DIR$">
       <excludeFolder url="file://$MODULE_DIR$/venv" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.7 (dxf_reader)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.7" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="TestRunnerService">

+ 1 - 1
.idea/misc.xml

@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (dxf_reader)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
 </project>

BIN
GV_12.PDF


+ 1 - 1
merge_pandas.py

@@ -1,5 +1,5 @@
 import pandas
 df = pandas.read_csv('text.csv', header = 0, delimiter=";")
-df['Text'] = df.groupby(['x','y'])['Text'].transform('sum')
+df['Text'] = df.groupby(['X','Y'])['TEXT'].transform('sum')
 df.drop_duplicates()
 df.to_csv("merged.csv")

+ 42 - 0
ocr_test.py

@@ -0,0 +1,42 @@
"""Extract text from a PDF and reduce it to a list of keywords.

Tries PyPDF2 first to read the PDF's embedded text. If the document
yields no text (PyPDF2 cannot read scanned/image-only PDFs), falls back
to Apache Tika's parser to extract the contents. The resulting text is
tokenized with NLTK and filtered of English stopwords and punctuation.
"""
import PyPDF2
from tika import parser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

filename = "GV_12.pdf"

# Use a context manager so the file handle is closed even on error
# (the original left the file open for the lifetime of the process).
with open(filename, "rb") as pdf_file:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    # Join the per-page text once instead of growing a string with +=
    # inside a while loop, which is quadratic in the number of pages.
    text = "".join(
        pdf_reader.getPage(page_index).extractText()
        for page_index in range(pdf_reader.numPages)
    )

# PyPDF2 returns an empty string for scanned (image-only) PDFs; in that
# case fall back to Tika, which can extract text from such documents.
# (The original's `if text != "": text = text` branch was a no-op and
# has been dropped; only the meaningful fallback branch remains.)
if not text:
    raw = str(parser.from_file("GV_12.pdf"))
    safe_text = raw.encode('utf-8', errors='ignore')
    text = str(safe_text).replace("\n", "").replace("\\", "")
    print(text)

# Break the text into individual word tokens.
tokens = word_tokenize(text)
# Punctuation we want excluded from the keyword list.
punctuations = ['(', ')', ';', ':', '[', ']', ',']
# English stopwords ("the", "I", "and", ...) carry no keyword value.
stop_words = stopwords.words('english')
# Keep only tokens that are neither stopwords nor listed punctuation.
keywords = [word for word in tokens
            if word not in stop_words and word not in punctuations]
print(keywords)

+ 16 - 0
read_pdf.py

@@ -0,0 +1,16 @@
"""Extract the raw contents of a PDF with textract.

textract dispatches to an appropriate backend (e.g. pdftotext, or OCR
for scanned documents) based on the file type.
"""
# NOTE(review): the earlier commented-out tika experiment has been
# removed — dead code belongs in version control history, not the file.

import textract

# NOTE(review): textract.process returns *bytes*, not str; decode at
# the point of use if text is needed, e.g.
# text.decode("utf-8", errors="ignore").
text = textract.process("GV_12.pdf")