# ocr_test.py
  1. import PyPDF2
  2. from tika import parser
  3. from nltk.tokenize import word_tokenize
  4. from nltk.corpus import stopwords
  5. import nltk
  6. #write a for-loop to open many files -- leave a comment if you'd #like to learn how
  7. filename = "GV_12.pdf"
  8. #open allows you to read the file
  9. pdfFileObj = open(filename,'rb')
  10. #The pdfReader variable is a readable object that will be parsed
  11. pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  12. #discerning the number of pages will allow us to parse through all #the pages
  13. num_pages = pdfReader.numPages
  14. count = 0
  15. text = ""
  16. #The while loop will read each page
  17. while count < num_pages:
  18. pageObj = pdfReader.getPage(count)
  19. count +=1
  20. text += pageObj.extractText()
  21. #This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned files.
  22. if text != "":
  23. text = text
  24. #If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
  25. else:
  26. raw = parser.from_file("GV_12.pdf")
  27. raw = str(raw)
  28. safe_text = raw.encode('utf-8', errors='ignore')
  29. text = str(safe_text).replace("\n", "").replace("\\", "")
  30. print(text)
  31. #The word_tokenize() function will break our text phrases into #individual words
  32. tokens = word_tokenize(text)
  33. #we'll create a new list which contains punctuation we wish to clean
  34. punctuations = ['(',')',';',':','[',']',',']
  35. #We initialize the stopwords variable which is a list of words like #"The", "I", "and", etc. that don't hold much value as keywords
  36. stop_words = stopwords.words('english')
  37. #We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN punctuations.
  38. keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
  39. print(keywords)