123456789101112131415161718192021222324252627282930313233343536373839404142 |
import nltk
import PyPDF2
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tika import parser
# Extract the text of a single PDF and print its keywords (tokens that are
# neither English stop words nor punctuation).
# To process many files, wrap the steps below in a for-loop over the paths.
filename = "GV_12.pdf"

# Read the embedded text layer page by page. The `with` block guarantees the
# file handle is closed even if parsing raises.
# NOTE(review): PdfFileReader / numPages / getPage / extractText are the legacy
# PyPDF2 names; PyPDF2 3.x replaced them with PdfReader / pages /
# extract_text() -- confirm the installed version before upgrading.
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Knowing the page count lets us visit every page exactly once.
    num_pages = pdfReader.numPages
    # Pages must be read while the file is still open, so join them here.
    text = "".join(
        pdfReader.getPage(page_index).extractText()
        for page_index in range(num_pages)
    )

# PyPDF2 cannot read scanned (image-only) PDFs, so an empty or whitespace-only
# result means there is no usable text layer. Fall back to Apache Tika then.
if not text.strip():
    # parser.from_file() returns a dict; the extracted text lives under
    # "content" and may be None when Tika finds nothing.
    raw = parser.from_file(filename)
    content = raw.get("content") or ""
    # Drop anything that cannot be encoded as UTF-8 while keeping a real str
    # (str() on the encoded bytes would yield a "b'...'" repr instead).
    text = content.encode('utf-8', errors='ignore').decode('utf-8')
print(text)

# word_tokenize() breaks the text into individual word/punctuation tokens;
# it treats newlines as ordinary whitespace, so they need no special handling.
tokens = word_tokenize(text)

# Punctuation tokens we do not want to keep as keywords.
punctuations = ['(', ')', ';', ':', '[', ']', ',']

# Stop words are words like "the", "I", "and" that carry little value as
# keywords.
stop_words = stopwords.words('english')

# One set gives O(1) membership tests for every token.
excluded = set(stop_words).union(punctuations)

# Keep only tokens that are neither stop words nor punctuation. The stop-word
# list is lowercase, so compare case-insensitively to also drop "The", "And".
keywords = [word for word in tokens if word.lower() not in excluded]
print(keywords)
|