order_bounding_boxes_in_each_block.py 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. ### FIRST READ EACH BLOCK IN AN ARRAY
  2. from bs4 import BeautifulSoup
  3. import subprocess
  4. import re
  5. def get_bound_box(file):
  6. response = open(file)
  7. html_doc = response.read()
  8. response.close()
  9. html_file = BeautifulSoup(html_doc, 'html.parser')
  10. all_elements = []
  11. blocks = html_file.findAll('block')
  12. for block in blocks:
  13. list_elements = []
  14. words = block.findAll('word')
  15. for word in words:
  16. word_list = []
  17. word_list.append(word["xmin"])
  18. word_list.append(word["ymin"])
  19. word_list.append(word["xmax"])
  20. word_list.append(word["ymax"])
  21. word_list.append(word.string)
  22. list_elements.append(word_list)
  23. all_elements.append(list_elements)
  24. #### NEXT SORT ELEMENTS IN EACH BLOCK BY THEIR X AND Y COORDINATES
  25. #### FIRST TRYING XMIN und YMAX
  26. ###FIRST CHECKING IF THE ELEMENTS ARE VERTICAL, IF YES THEN NO SORTING
  27. new_all_elements = []
  28. for element in all_elements:
  29. later_bigger = (float(element[-1][0])-(float(element[0][0]))) #check if xmin from first element is bigger than xmin from last element
  30. abstand_x = abs(float(element[-1][0])-(float(element[0][2])))
  31. abstand_y = abs(float(element[-1][3])-float(element[0][1]))
  32. if later_bigger >= -5:
  33. #print(abstand_x-abstand_y)
  34. new_all_elements.append(element)
  35. else:
  36. new_element = sorted(element, key=lambda k: [float(k[0])])
  37. new_all_elements.append(new_element)
  38. """for element in new_all_elements:
  39. for blub in element:
  40. #print(blub[4])
  41. #print("\n")"""
  42. return new_all_elements
  43. def pdf_to_html(uuid,filepath):
  44. filename = str(uuid)+"out.html"
  45. subprocess.call(['pdftotext', '-bbox-layout',
  46. filepath, filename])
  47. return filename
  48. def extract_isos(result):
  49. reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
  50. details_ = []
  51. for element in result:
  52. new_arr = ""
  53. #print(element)
  54. for x in element:
  55. new_arr += x[4] + " "
  56. #print(new_arr)
  57. if re.search(reg,new_arr):
  58. #print(new_arr)
  59. found = re.findall(reg, new_arr)
  60. for f in found:
  61. if len(f[0]) != 0:
  62. details_.append(f[0].replace(")",""))
  63. if len(f[1]) != 0:
  64. details_.append(f[1])
  65. return details_
  66. #file="/home/bscheibel/PycharmProjects/dxf_reader/drawings/5152166_Rev04.html"
  67. #get_bound_box(file)
  68. #pdf_to_html("/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.pdf")