123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- ### FIRST READ EACH BLOCK IN AN ARRAY
- from bs4 import BeautifulSoup
- import subprocess
- import redis
- import re
- import json
def get_bound_box(uuid, file):
    """Collect the word bounding boxes of every block in a bbox-layout file.

    Each ``<block>`` element of the parsed document becomes one list of
    words, and each ``<word>`` becomes a five-item list
    ``[xmin, ymin, xmax, ymax, text]``. The coordinates are kept exactly as
    the attribute values found in the file (no numeric conversion), and
    ``text`` is the ``.string`` value of the word element.

    Within a block the words keep their document order unless the last word
    starts more than 5 coordinate units to the left of the first word; only
    then is the block re-sorted by ascending ``xmin``.

    Args:
        uuid: Identifier of the processed document; returned unchanged.
        file: Path of the file to parse.

    Returns:
        A tuple ``(blocks, uuid)`` where ``blocks`` is the list of word
        lists described above.
    """
    # The context manager closes the file even when reading fails.
    with open(file) as response:
        html_doc = response.read()
    html_file = BeautifulSoup(html_doc, 'html.parser')

    # First read every block into a list of [xmin, ymin, xmax, ymax, text].
    all_elements = [
        [
            [word["xmin"], word["ymin"], word["xmax"], word["ymax"], word.string]
            for word in block.find_all('word')
        ]
        for block in html_file.find_all('block')
    ]

    # Next decide per block whether the words need to be ordered by xmin.
    new_all_elements = []
    for element in all_elements:
        if not element:
            # A block without words has no first/last word to compare;
            # indexing it would raise IndexError, so pass it through as is.
            new_all_elements.append(element)
            continue

        # Negative when the last word starts left of the first word.
        x_shift = float(element[-1][0]) - float(element[0][0])
        if x_shift >= -5:
            # Presumably left-to-right or vertical text (per the original
            # author's note): keep the document order.
            new_all_elements.append(element)
        else:
            new_all_elements.append(
                sorted(element, key=lambda k: float(k[0]))
            )
    return new_all_elements, uuid
def pdf_to_html(uuid,filepath):
    """Run ``pdftotext -bbox-layout`` on *filepath*.

    The output file name is ``str(uuid)`` with ``out.html`` appended. The
    exit status of the command is not inspected and nothing is returned.
    """
    output_name = str(uuid)+'out.html'
    command = ['pdftotext', '-bbox-layout', filepath, output_name]
    subprocess.call(command)
# Group 1: "ISO", one whitespace character, at least three digits, then an
#          optional non-word character, digit, non-word character and digit
#          (each one optional on its own), e.g. "ISO 2768-1".
# Group 2: the word "EN", one whitespace character and at least one digit.
#          The word boundary and the mandatory digit keep ordinary words
#          that merely end in "EN" ("OPEN ", "DIN EN ISO ...") from matching.
_STANDARD_PATTERN = re.compile(r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(\bEN\s\d+)")


def extract_isos(result):
    """Find ISO / EN standard references in blocks of words.

    Args:
        result: Iterable of blocks. Each block is an iterable of word
            records whose item at index 4 is the word text (the shape
            produced by ``get_bound_box``). Words whose text is ``None``
            are skipped.

    Returns:
        A list of the matched reference strings in order of appearance.
        ISO matches have every ``")"`` removed; EN matches are returned as
        matched. Other trailing characters captured by the ISO pattern
        (for example a trailing space) are kept.
    """
    details_ = []
    for element in result:
        # Join the words of a block with a trailing space after each one so
        # that references split across several words can be matched.
        text = "".join(
            word[4] + " " for word in element if word[4] is not None
        )
        for match in _STANDARD_PATTERN.finditer(text):
            iso_ref, en_ref = match.groups()
            if iso_ref:
                details_.append(iso_ref.replace(")", ""))
            if en_ref:
                details_.append(en_ref)
    return details_
def main(uuid, result):
    """Extract the standard references of one PDF and store them in Redis.

    Args:
        uuid: Identifier used to build the intermediate file name
            (``<uuid>out.html``) and the Redis key (``<uuid>isos``).
        result: Path of the PDF file; despite its name it is handed to
            ``pdf_to_html`` as the input file.

    The references found by ``extract_isos`` are serialised as a JSON list
    and written to the Redis server on ``localhost``. Nothing is returned.
    """
    pdf_to_html(uuid, result)

    html_path = str(uuid)+"out.html"
    blocks, uuid = get_bound_box(uuid, html_path)

    payload = json.dumps(extract_isos(blocks))

    connection = redis.Redis("localhost")
    connection.set(str(uuid)+"isos", payload)
- """file = "/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.html"
- res, uuid = get_bound_box("uuu", file)
- isos = extract_isos(res)
- print(isos)
- #pdf_to_html("/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.pdf")
- """
- #main("uud","/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.pdf")
|