|
@@ -0,0 +1,42 @@
|
|
|
import PyPDF2
from tika import parser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Extract keywords from a single PDF:
#   1. read the embedded text layer with PyPDF2,
#   2. if nothing usable comes back, fall back to Apache Tika,
#   3. tokenize with NLTK and drop stop words and selected punctuation.
# NOTE: word_tokenize() and stopwords.words() need the NLTK "punkt" and
# "stopwords" data packages to be installed (see nltk.download()).

# Path of the PDF to process. To handle several files, wrap the steps below
# in a loop over the file names.
filename = "GV_12.pdf"

# NOTE(review): PdfFileReader / numPages / getPage / extractText are the
# legacy PyPDF2 names; later releases renamed them. Confirm against the
# installed PyPDF2 version before upgrading.
# The context manager guarantees the file handle is closed even if parsing
# raises; pages must be read while the file is still open.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # The page count lets us walk every page of the document.
    num_pages = pdfReader.numPages
    text = "".join(
        pdfReader.getPage(page_index).extractText()
        for page_index in range(num_pages)
    )

# PyPDF2 only returns text that is embedded in the PDF, so image-only
# (scanned) files come back empty or whitespace-only. In that case hand the
# same file to Tika instead.
if not text.strip():
    raw = parser.from_file(filename)
    # from_file() returns a dict; the extracted text lives under "content"
    # and may be None when nothing could be extracted.
    content = raw.get("content") or ""
    # Collapse newlines and runs of whitespace into single spaces so words on
    # adjacent lines are not glued together.
    text = " ".join(content.split())
    print(text)

# word_tokenize() breaks the text into individual word/punctuation tokens.
tokens = word_tokenize(text)
# Punctuation tokens we want to discard.
punctuations = ['(', ')', ';', ':', '[', ']', ',']
# Common English words such as "the", "i", "and" that carry little value as
# keywords. The NLTK list is lowercase.
stop_words = stopwords.words('english')
# Build the lookup set once so each membership test is O(1).
excluded = set(stop_words) | set(punctuations)
# Keep every token that is neither a stop word nor listed punctuation. The
# comparison is case-insensitive so capitalised stop words ("The", "I") are
# removed too, while the surviving keywords keep their original casing.
keywords = [word for word in tokens if word.lower() not in excluded]
print(keywords)
|