bscheibel
/
technical_drawings_extraction


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
							import nltk
import re
from tika import parser
einleitung = False
raw = parser.from_file('/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF')
#raw = parser.from_file('iso_documents/ISO286-2.PDF')
print(raw['content'])
#text = raw['content']
#sent_text = nltk.sent_tokenize(text)
#tokenized_text = nltk.word_tokenize(sent_text.split)
#tagged = nltk.pos_tag(tokenized_text)
#match = text.concordance('Toleranz')
#for text in sent_text:
#    if "Toleranz" in text and einleitung is True:
#        print(text)
#    if "Einleitung" in text:
#        einleitung = True


import subprocess
#subprocess.check_output(['ls','-l']) #all that is technically needed...
cmd = 'pdftotext -layout "/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO8015.PDF"'
print(subprocess.Popen(cmd, shell=True))

#convert iso document to text
text = "iso_documents/ISO8015.txt"
#search for table of content with regex
contents = []
regex = r"(.*?)[\W]+(\d+)(?=\n|$)"
r"([^\.]\d\.?\d?\.?\d?\.?\d?)\s([a-zA-Z]*)\s([a-zA-Z]*)\s*([a-zA-Z]*)\W?\s*([a-zA-Z]+)\s*\.{10,}([\d]+)"
matches = re.finditer(regex, text, re.MULTILINE)
#contents = re.findall(r"(.*?)[\W]+(\d+)(?=\n|$)", text, flags=re.M)
#print(contents)
for matchNum, match in enumerate(matches, start=1):

    print("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum=matchNum, start=match.start(),
                                                                        end=match.end(), match=match.group()))

    for groupNum in range(0, len(match.groups())):
        groupNum = groupNum + 1

        print("Group {groupNum} found at {start}-{end}: {group}".format(groupNum=groupNum, start=match.start(groupNum),
                                                                        end=match.end(groupNum),
                                                                        group=match.group(groupNum)))
#only search for sections with toleranzen/abmaße