# order_bounding_boxes_in_each_block.py

### FIRST READ EACH BLOCK IN AN ARRAY
import re
import subprocess

from bs4 import BeautifulSoup
  5. def get_bound_box(file):
  6. response = open(file)
  7. html_doc = response.read()
  8. response.close()
  9. html_file = BeautifulSoup(html_doc, 'html.parser')
  10. all_elements = []
  11. blocks = html_file.findAll('block')
  12. number_blocks = len(blocks)
  13. number_words = 0
  14. for block in blocks:
  15. list_elements = []
  16. words = block.findAll('word')
  17. number_words += len(words)
  18. for word in words:
  19. word_list = []
  20. word_list.append(word["xmin"])
  21. word_list.append(word["ymin"])
  22. word_list.append(word["xmax"])
  23. word_list.append(word["ymax"])
  24. word_list.append(word.string)
  25. list_elements.append(word_list)
  26. all_elements.append(list_elements)
  27. #### NEXT SORT ELEMENTS IN EACH BLOCK BY THEIR X AND Y COORDINATES
  28. #### FIRST TRYING XMIN und YMAX
  29. ###FIRST CHECKING IF THE ELEMENTS ARE VERTICAL, IF YES THEN NO SORTING
  30. new_all_elements = []
  31. for element in all_elements:
  32. later_bigger = (float(element[-1][0])-(float(element[0][0]))) #check if xmin from first element is bigger than xmin from last element
  33. abstand_x = abs(float(element[-1][0])-(float(element[0][2])))
  34. abstand_y = abs(float(element[-1][3])-float(element[0][1]))
  35. if later_bigger >= -5:
  36. new_all_elements.append(element)
  37. else:
  38. new_element = sorted(element, key=lambda k: [float(k[0])])
  39. new_all_elements.append(new_element)
  40. return new_all_elements, number_blocks, number_words
  41. def pdf_to_html(uuid,filepath, path):
  42. filename = path +"/temporary/" +str(uuid)+"out.html"
  43. subprocess.call(['pdftotext', '-bbox-layout',
  44. filepath, filename])
  45. return filename
  46. def extract_isos(result):
  47. reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
  48. details_ = []
  49. reg_general = r"ISO\s?\d*\s*\W\s*[fmcv][HKL]"
  50. general_tol = ""
  51. for element in result:
  52. new_arr = ""
  53. for x in element:
  54. new_arr += x[4] + " "
  55. if re.search(reg,new_arr):
  56. found = re.findall(reg, new_arr)
  57. for f in found:
  58. if len(f[0]) != 0:
  59. details_.append(f[0].replace(")",""))
  60. if len(f[1]) != 0:
  61. details_.append(f[1])
  62. if re.search(reg_general, new_arr):
  63. general_tol = new_arr
  64. return details_, str(general_tol)
  65. def get_tables(result):
  66. reg = r"(Start drawing)|(All dimensions)"
  67. tables = []
  68. for element in result:
  69. new = []
  70. if re.search(reg, element):
  71. new.extend(result[element])
  72. new.append(element)
  73. tables.append(new)
  74. number = len(tables)
  75. return tables