order_bounding_boxes_in_each_block.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. ### FIRST READ EACH BLOCK IN AN ARRAY
  2. from bs4 import BeautifulSoup
  3. def get_bound_box(file):
  4. response = open(file)
  5. html_doc = response.read()
  6. response.close()
  7. html_file = BeautifulSoup(html_doc, 'html.parser')
  8. all_elements = []
  9. blocks = html_file.findAll('block')
  10. for block in blocks:
  11. list_elements = []
  12. words = block.findAll('word')
  13. for word in words:
  14. word_list = []
  15. word_list.append(word["xmin"])
  16. word_list.append(word["ymin"])
  17. word_list.append(word["xmax"])
  18. word_list.append(word["ymax"])
  19. word_list.append(word.string)
  20. list_elements.append(word_list)
  21. all_elements.append(list_elements)
  22. #### NEXT SORT ELEMENTS IN EACH BLOCK BY THEIR X AND Y COORDINATES
  23. #### FIRST TRYING XMIN und YMAX
  24. ###FIRST CHECKING IF THE ELEMENTS ARE VERTICAL, IF YES THEN NO SORTING
  25. new_all_elements = []
  26. for element in all_elements:
  27. later_bigger = (float(element[-1][0])-(float(element[0][0]))) #check if xmin from first element is bigger than xmin from last element
  28. abstand_x = abs(float(element[-1][0])-(float(element[0][2])))
  29. abstand_y = abs(float(element[-1][3])-float(element[0][1]))
  30. if later_bigger >= -5:
  31. #print(abstand_x-abstand_y)
  32. new_all_elements.append(element)
  33. else:
  34. new_element = sorted(element, key=lambda k: [float(k[0])])
  35. new_all_elements.append(new_element)
  36. """for element in new_all_elements:
  37. for blub in element:
  38. #print(blub[4])
  39. #print("\n")"""
  40. return new_all_elements
  41. #file="/home/bscheibel/PycharmProjects/dxf_reader/drawings/5152166_Rev04.html"
  42. #get_bound_box(file)