1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
import re
import subprocess
from pathlib import Path

import nltk
from tika import parser
# Directory holding the ISO standard PDFs this script works on.
ISO_DOCUMENTS_DIR = Path(
    "/home/bscheibel/PycharmProjects/engineering_drawings_extraction/iso_documents"
)
# PDF whose Tika-extracted text is dumped to stdout
# (ISO286-2.PDF was noted as an alternative input).
TIKA_SOURCE_PDF = ISO_DOCUMENTS_DIR / "ISO2768-1.PDF"
# PDF that is converted with pdftotext and searched for its table of contents.
TOC_SOURCE_PDF = ISO_DOCUMENTS_DIR / "ISO8015.PDF"

# A line matches when it ends in digits that are preceded by at least one
# non-word character; group 1 is the text before them, group 2 the digits.
TOC_LINE_PATTERN = re.compile(r"(.*?)[\W]+(\d+)(?=\n|$)", re.MULTILINE)

# Second pattern that the original script left behind as a bare, unused string
# literal. It is kept for reference only and is not applied anywhere.
DOTTED_TOC_LINE_REGEX = (
    r"([^\.]\d\.?\d?\.?\d?\.?\d?)\s([a-zA-Z]*)\s([a-zA-Z]*)\s*([a-zA-Z]*)\W?\s*([a-zA-Z]+)\s*\.{10,}([\d]+)"
)


def dump_pdf_text(pdf_path):
    """Print the ``content`` entry Tika returns for *pdf_path*."""
    raw = parser.from_file(str(pdf_path))
    print(raw['content'])


def convert_pdf_to_text(pdf_path):
    """Convert *pdf_path* with ``pdftotext -layout`` and return the text.

    The output is written next to the PDF with a ``.txt`` suffix. The output
    path and encoding are passed explicitly so the file read back is the one
    just written.

    Raises:
        FileNotFoundError: if the ``pdftotext`` executable is not installed.
        subprocess.CalledProcessError: if ``pdftotext`` exits with an error.
    """
    txt_path = Path(pdf_path).with_suffix(".txt")
    # run() blocks until the conversion has finished; an argument list avoids
    # shell quoting problems with the path.
    subprocess.run(
        ["pdftotext", "-layout", "-enc", "UTF-8", str(pdf_path), str(txt_path)],
        check=True,
    )
    return txt_path.read_text(encoding="utf-8")


def format_toc_report(text):
    """Return report lines for every ``TOC_LINE_PATTERN`` match in *text*.

    Each match contributes one "Match ..." line followed by one "Group ..."
    line per capture group, with start/end offsets into *text*.
    """
    lines = []
    for match_num, match in enumerate(TOC_LINE_PATTERN.finditer(text), start=1):
        lines.append(
            "Match {matchNum} was found at {start}-{end}: {match}".format(
                matchNum=match_num,
                start=match.start(),
                end=match.end(),
                match=match.group(),
            )
        )
        # Capture groups are numbered from 1; group 0 is the whole match.
        for group_num, group in enumerate(match.groups(), start=1):
            lines.append(
                "Group {groupNum} found at {start}-{end}: {group}".format(
                    groupNum=group_num,
                    start=match.start(group_num),
                    end=match.end(group_num),
                    group=group,
                )
            )
    return lines


def main():
    """Dump one ISO PDF via Tika, then report table-of-contents lines of another."""
    dump_pdf_text(TIKA_SOURCE_PDF)

    # Convert the ISO document to text and search the text itself (not its
    # file name) for table-of-contents lines.
    text = convert_pdf_to_text(TOC_SOURCE_PDF)
    for line in format_toc_report(text):
        print(line)

    # TODO: only search the sections dealing with tolerances / deviations
    # ("Toleranzen" / "Abmaße"). An earlier nltk experiment split the Tika
    # text with nltk.sent_tokenize and printed every sentence containing
    # "Toleranz" once a sentence containing "Einleitung" had been seen.


if __name__ == "__main__":
    main()
|