import PyPDF2
from tika import parser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
#word_tokenize() also needs the punkt tokenizer models
nltk.download('punkt')

#write a for-loop to open many files -- see the sketch at the end of
#this script
filename = "../drawings/GV_12.PDF"

#open allows you to read the file; 'rb' opens it in binary mode
pdfFileObj = open(filename, 'rb')

#The pdfReader variable is a readable object that will be parsed
pdfReader = PyPDF2.PdfReader(pdfFileObj)

#discerning the number of pages will allow us to parse through all
#the pages
num_pages = len(pdfReader.pages)
count = 0
text = ""

#The while loop will read each page
while count < num_pages:
    pageObj = pdfReader.pages[count]
    count += 1
    text += pageObj.extract_text()

pdfFileObj.close()

#This if statement checks whether the library above returned any
#words. It's needed because PyPDF2 cannot read scanned files.
#If no text came back, we fall back to the tika parser to extract
#text from scanned/image-based PDF files.
if not text.strip():
    raw = parser.from_file(filename)
    #from_file() returns a dict; the extracted text is under "content"
    text = (raw["content"] or "").replace("\n", " ")
    print(text)

#The word_tokenize() function will break our text phrases into
#individual words
tokens = word_tokenize(text)

#we'll create a new list which contains punctuation we wish to clean
punctuations = ['(', ')', ';', ':', '[', ']', ',']

#We initialize the stopwords variable, which is a list of words like
#"The", "I", "and", etc. that don't hold much value as keywords
stop_words = stopwords.words('english')

#We create a list comprehension which only returns the words that are
#NOT in stop_words and NOT in punctuations
keywords = [word for word in tokens
            if word not in stop_words and word not in punctuations]

print(keywords)
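
#--- Optional: processing many PDFs at once ----------------------------
#A minimal sketch of the for-loop mentioned at the top of the script,
#assuming the PDFs live in the same "../drawings" folder used above.
#extract_text_from_pdf() is a hypothetical helper name (not from the
#original) that just wraps the PyPDF2/tika fallback logic shown earlier;
#glob is from the standard library. stop_words and punctuations are
#reused from the script above.
import glob

def extract_text_from_pdf(pdf_path):
    """Return the raw text of one PDF, falling back to tika if empty."""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        text = "".join(page.extract_text() or "" for page in reader.pages)
    if not text.strip():
        raw = parser.from_file(pdf_path)
        text = raw["content"] or ""
    return text

for pdf_path in glob.glob("../drawings/*.PDF"):
    tokens = word_tokenize(extract_text_from_pdf(pdf_path))
    keywords = [w for w in tokens
                if w not in stop_words and w not in punctuations]
    print(pdf_path, keywords[:10])  #show the first few keywords per file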