order_bounding_boxes_in_each_block.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. ### FIRST READ EACH BLOCK IN AN ARRAY
  2. from bs4 import BeautifulSoup
  3. import subprocess
  4. import redis
  5. import re
  6. import json
  7. def get_bound_box(uuid, file):
  8. print(file)
  9. response = open(file)
  10. html_doc = response.read()
  11. response.close()
  12. html_file = BeautifulSoup(html_doc, 'html.parser')
  13. all_elements = []
  14. blocks = html_file.findAll('block')
  15. for block in blocks:
  16. list_elements = []
  17. words = block.findAll('word')
  18. for word in words:
  19. word_list = []
  20. word_list.append(word["xmin"])
  21. word_list.append(word["ymin"])
  22. word_list.append(word["xmax"])
  23. word_list.append(word["ymax"])
  24. word_list.append(word.string)
  25. list_elements.append(word_list)
  26. all_elements.append(list_elements)
  27. #### NEXT SORT ELEMENTS IN EACH BLOCK BY THEIR X AND Y COORDINATES
  28. #### FIRST TRYING XMIN und YMAX
  29. ###FIRST CHECKING IF THE ELEMENTS ARE VERTICAL, IF YES THEN NO SORTING
  30. new_all_elements = []
  31. for element in all_elements:
  32. later_bigger = (float(element[-1][0])-(float(element[0][0]))) #check if xmin from first element is bigger than xmin from last element
  33. abstand_x = abs(float(element[-1][0])-(float(element[0][2])))
  34. abstand_y = abs(float(element[-1][3])-float(element[0][1]))
  35. if later_bigger >= -5:
  36. #print(abstand_x-abstand_y)
  37. new_all_elements.append(element)
  38. else:
  39. new_element = sorted(element, key=lambda k: [float(k[0])])
  40. new_all_elements.append(new_element)
  41. """for element in new_all_elements:
  42. for blub in element:
  43. #print(blub[4])
  44. #print("\n")"""
  45. db = redis.Redis("localhost")
  46. db.set(uuid, "test")
  47. return new_all_elements, uuid
  48. def pdf_to_html(uuid,filepath):
  49. subprocess.call(['pdftotext', '-bbox-layout',
  50. filepath, str(uuid)+'out.html'])
  51. def extract_isos(result):
  52. reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
  53. details_ = []
  54. for element in result:
  55. new_arr = ""
  56. #print(element)
  57. for x in element:
  58. new_arr += x[4] + " "
  59. print(new_arr)
  60. if re.search(reg,new_arr):
  61. #print(new_arr)
  62. found = re.findall(reg, new_arr)
  63. for f in found:
  64. if len(f[0]) != 0:
  65. details_.append(f[0].replace(")",""))
  66. if len(f[1]) != 0:
  67. details_.append(f[1])
  68. return details_
  69. def main(uuid, result):
  70. pdf_to_html(uuid, result)
  71. res, uuid = get_bound_box(uuid, str(uuid)+"out.html")
  72. isos = extract_isos(res)
  73. isos_j = json.dumps(isos)
  74. db = redis.Redis("localhost")
  75. print(isos)
  76. db.set(uuid, str(isos_j))
# Disabled manual test run. The triple-quoted text below is an inert string
# literal: it is evaluated and discarded when the module is imported, so none
# of the calls inside it are executed. The final call to main() is likewise
# commented out.
"""file = "/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.html"
res, uuid = get_bound_box("uuu", file)
isos = extract_isos(res)
print(isos)
#pdf_to_html("/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.pdf")
"""
#main("uud","/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.pdf")