# ocr_test.py (1.8 KB)
  1. import PyPDF2
  2. from tika import parser
  3. from nltk.tokenize import word_tokenize
  4. from nltk.corpus import stopwords
  5. import nltk
  6. nltk.download('stopwords')
  7. #write a for-loop to open many files -- leave a comment if you'd #like to learn how
  8. filename = "../drawings/GV_12.PDF"
  9. #open allows you to read the file
  10. pdfFileObj = open(filename,'rb')
  11. #The pdfReader variable is a reada2ble object that will be parsed
  12. pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
  13. #discerning the number of pages will allow us to parse through all #the pages
  14. num_pages = pdfReader.numPages
  15. count = 0
  16. text = ""
  17. #The while loop will read each page
  18. while count < num_pages:
  19. pageObj = pdfReader.getPage(count)
  20. count +=1
  21. text += pageObj.extractText()
  22. #This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned files.
  23. if text != "":
  24. text = text
  25. #If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
  26. else:
  27. raw = parser.from_file("../drawings/GV_12.PDF")
  28. raw = str(raw)
  29. safe_text = raw.encode('utf-8', errors='ignore')
  30. text = str(safe_text).replace("\n", "").replace("\\", "")
  31. print(raw)
  32. #The word_tokenize() function will break our text phrases into #individual words
  33. tokens = word_tokenize(text)
  34. #we'll create a new list which contains punctuation we wish to clean
  35. punctuations = ['(',')',';',':','[',']',',']
  36. #We initialize the stopwords variable which is a list of words like #"The", "I", "and", etc. that don't hold much value as keywords
  37. stop_words = stopwords.words('english')
  38. #We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN punctuations.
  39. keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
  40. print(keywords)