12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- from bs4 import BeautifulSoup
- import subprocess
- import re
def get_bound_box(file):
    """Collect the bounding boxes of every word, grouped by ``<block>`` tag.

    The file is parsed with BeautifulSoup's ``html.parser``. Each ``<block>``
    element becomes one list of word entries; each entry is
    ``[xmin, ymin, xmax, ymax, text]`` where the coordinates are the raw
    attribute strings of the ``<word>`` tag and ``text`` is ``word.string``.
    NOTE(review): presumably ``file`` is the output written by ``pdf_to_html``
    (pdftotext ``-bbox-layout``) -- confirm against callers.

    A block whose last word starts more than 5 units to the left of its first
    word is re-sorted by ascending ``xmin``; every other block keeps its
    document order.

    Args:
        file: Path of the file to read.

    Returns:
        A tuple ``(blocks, number_blocks, number_words)`` where ``blocks`` is
        the list of per-block word-entry lists, ``number_blocks`` is the count
        of ``<block>`` tags and ``number_words`` is the total count of
        ``<word>`` tags found inside them.
    """
    # The context manager closes the handle even when read() raises.
    with open(file) as handle:
        html_doc = handle.read()
    html_file = BeautifulSoup(html_doc, 'html.parser')

    blocks = html_file.find_all('block')
    number_words = 0
    new_all_elements = []
    for block in blocks:
        words = block.find_all('word')
        number_words += len(words)
        entries = [
            [word["xmin"], word["ymin"], word["xmax"], word["ymax"], word.string]
            for word in words
        ]

        # A block without words has no first/last entry to compare; keep it
        # as an empty list so the result still has one item per block.
        if not entries:
            new_all_elements.append(entries)
            continue

        # Difference between the xmin of the last and the first word: a value
        # below -5 means the block's words are not in left-to-right order.
        later_bigger = float(entries[-1][0]) - float(entries[0][0])
        if later_bigger >= -5:
            new_all_elements.append(entries)
        else:
            new_all_elements.append(
                sorted(entries, key=lambda entry: float(entry[0]))
            )

    return new_all_elements, len(blocks), number_words
def pdf_to_html(uuid, filepath, path):
    """Run ``pdftotext -bbox-layout`` on a file and return the output path.

    The output file name is ``<path>/temporary/<uuid>out.html``. It is printed
    to stdout before the external ``pdftotext`` command is invoked. The exit
    status of the command is not inspected.

    Args:
        uuid: Identifier used as the prefix of the output file name.
        filepath: Path of the input file handed to ``pdftotext``.
        path: Base directory; the output goes into its ``temporary`` folder.

    Returns:
        The path of the output file as a string.
    """
    target = "".join([path, "/temporary/", str(uuid), "out.html"])
    print(target)
    command = ['pdftotext', '-bbox-layout', filepath, target]
    subprocess.call(command)
    return target
def extract_isos(result):
    """Find ISO/EN references and the general-tolerance text in word blocks.

    Every block in ``result`` is flattened to a single string by appending
    each word's text (index 4 of the word entry) followed by one space. That
    string is then scanned for ISO/EN references and for a general-tolerance
    note of the form ``ISO <digits> - <f|m|c|v><H|K|L>``.

    Args:
        result: Iterable of blocks; each block is an iterable of word entries
            whose item at index 4 is the word text. Entries whose text is
            ``None`` are ignored.

    Returns:
        A tuple ``(details, general_tol)``. ``details`` lists every matched
        reference in order of appearance, with ``)`` characters removed from
        ISO matches. ``general_tol`` is the full flattened text of the last
        block that contains a general-tolerance note, or ``""`` if none does.
    """
    standard_re = re.compile(r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)")
    general_re = re.compile(r"ISO\s?\d*\s*\W\s*[fmcv][HKL]")

    details_ = []
    general_tol = ""
    for element in result:
        # Each word keeps its trailing space (also after the last word) so
        # the patterns see exactly the text layout they were written for.
        # Words without text (None) would break the concatenation, skip them.
        text = "".join(
            word[4] + " " for word in element if word[4] is not None
        )

        # findall yields one (iso, en) tuple per match; the alternative that
        # did not take part in the match is the empty string.
        for iso_ref, en_ref in standard_re.findall(text):
            if iso_ref:
                details_.append(iso_ref.replace(")", ""))
            if en_ref:
                details_.append(en_ref)

        if general_re.search(text):
            general_tol = text

    return details_, str(general_tol)
def get_tables(result):
    """Select the entries whose key mentions a drawing-table marker.

    ``result`` is iterated and indexed by its own elements, so it is expected
    to be a mapping from strings to iterables. For every key that contains
    ``"Start drawing"`` or ``"All dimensions"`` anywhere in its text, a new
    list is built from the key's value items followed by the key itself.

    Args:
        result: Mapping of string keys to iterables of items.

    Returns:
        A list with one ``[*items, key]`` list per matching key, in the
        mapping's iteration order. Empty when nothing matches.
    """
    marker_re = re.compile(r"(Start drawing)|(All dimensions)")
    return [
        list(result[key]) + [key]
        for key in result
        if marker_re.search(key)
    ]
|