# Source repository: bscheibel/technical_drawings_extraction


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243
							import PyPDF2
from tika import parser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')


# Extract keyword tokens from a single technical-drawing PDF and print them.
# To process many files, wrap the steps below in a for-loop over the paths.
filename = "../drawings/GV_12.PDF"

# Read the embedded text layer page by page. The file is opened in a
# ``with`` block so the handle is closed even if parsing raises; the pages
# must be extracted inside the block because the reader pulls from the
# open stream.
with open(filename, 'rb') as pdfFileObj:
    # pdfReader is the parsed, readable view of the PDF
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # knowing the number of pages lets us walk through all of them
    num_pages = pdfReader.numPages
    text = "".join(
        pdfReader.getPage(page_index).extractText()
        for page_index in range(num_pages)
    )

# PyPDF2 cannot read scanned/image-based files, so an empty (or
# whitespace-only) result means there was no usable text layer. In that
# case fall back to Apache Tika for the same file.
if not text.strip():
    raw = parser.from_file(filename)
    # Diagnostic dump of the full Tika response (metadata and content).
    print(raw)
    # Tika returns a dict; only its "content" entry is document text, and it
    # is None when nothing could be extracted. Newlines become spaces so
    # words on adjacent lines are not glued together before tokenizing.
    text = (raw.get("content") or "").replace("\n", " ").replace("\\", "")

# word_tokenize() breaks the text into individual words
tokens = word_tokenize(text)
# punctuation tokens we wish to drop
punctuations = ['(',')',';',':','[',']',',']
# stop_words is a list of words like "the", "i", "and", etc. that don't
# hold much value as keywords
stop_words = stopwords.words('english')
# Build the exclusion set once: O(1) membership tests instead of scanning
# two lists for every token.
excluded = set(stop_words).union(punctuations)
# NLTK's stopword list is lowercase, so compare case-insensitively
# (drawings are often all-caps) while keeping each kept token's original
# spelling.
keywords = [word for word in tokens if word.lower() not in excluded]
print(keywords)