# order_bounding_boxes_in_each_block.py (2.3 KB)
  1. from bs4 import BeautifulSoup
  2. import subprocess
  3. import re
  def get_bound_box(file):
      """Read word bounding boxes from an HTML/XML file, grouped by block.

      The file is parsed with BeautifulSoup's ``html.parser``. Every
      ``<block>`` tag yields one list of words; every ``<word>`` inside it
      becomes ``[xmin, ymin, xmax, ymax, text]`` where the coordinates are the
      raw attribute strings and ``text`` is ``word.string`` (which may be
      ``None`` when the tag has no single text child).

      Presumably the input is the output of ``pdftotext -bbox-layout`` as
      written by ``pdf_to_html`` -- confirm against the caller.

      Args:
          file: Path of the file to read.

      Returns:
          A tuple ``(blocks, number_blocks, number_words)``: the per-block
          word lists, the number of ``<block>`` tags, and the total number of
          ``<word>`` tags found inside them.
      """
      # The context manager closes the handle even if read() raises.
      with open(file) as response:
          html_doc = response.read()
      html_file = BeautifulSoup(html_doc, 'html.parser')

      blocks = html_file.find_all('block')
      number_words = 0
      new_all_elements = []
      for block in blocks:
          words = block.find_all('word')
          number_words += len(words)
          list_elements = [
              [word["xmin"], word["ymin"], word["xmax"], word["ymax"], word.string]
              for word in words
          ]
          # A block without words has no first/last entry to compare; keep it
          # as an empty list instead of failing on the index access below.
          if list_elements:
              # Difference between the xmin of the last and the first word.
              # A value below -5 means the last word starts clearly left of
              # the first one, so the block is re-sorted by xmin.
              later_bigger = float(list_elements[-1][0]) - float(list_elements[0][0])
              if later_bigger < -5:
                  list_elements = sorted(list_elements, key=lambda k: float(k[0]))
          new_all_elements.append(list_elements)
      return new_all_elements, len(blocks), number_words
  def pdf_to_html(uuid, filepath, path):
      """Run ``pdftotext -bbox-layout`` on a file and return the output path.

      The output file name is ``<path>/temporary/<uuid>out.html``. The name is
      printed before the conversion and the marker ``test2`` afterwards.

      Args:
          uuid: Identifier placed in the output file name (converted with
              ``str``).
          filepath: Input file handed to ``pdftotext``.
          path: Base directory as a string; the ``temporary`` sub-directory
              presumably has to exist already -- confirm against the caller.

      Returns:
          The output file name. The exit status of ``pdftotext`` is not
          inspected, so the file may be missing if the conversion failed.
      """
      filename = "".join((path, "/temporary/", str(uuid), "out.html"))
      print(filename)
      command = ['pdftotext', '-bbox-layout', filepath, filename]
      subprocess.call(command)
      print("test2")
      return filename
  def extract_isos(result):
      """Find ISO/EN references and the general-tolerance text in word blocks.

      Each element of ``result`` is a sequence of word entries whose item at
      index 4 is the word text. The texts of one element are concatenated,
      each followed by a single space, and searched with two patterns.

      Args:
          result: Iterable of blocks; every block is an iterable of indexable
              word entries with the text at index 4. Entries whose text is
              ``None`` are skipped.

      Returns:
          A tuple ``(details, general_tol)``. ``details`` lists every match of
          the ISO/EN pattern in encounter order (``)`` characters are removed
          from ISO matches). ``general_tol`` is the concatenated text of the
          last block matching the general-tolerance pattern, or ``""`` if no
          block matches.
      """
      reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
      reg_general = r"ISO\s?\d*\s*\W\s*[fmcv][HKL]"
      # NOTE(review): the second alternative of ``reg`` has no word boundary
      # and allows zero digits, so it also matches a bare "EN " at the end of
      # an ordinary word. Left unchanged to keep the reported values stable.
      details_ = []
      general_tol = ""
      for element in result:
          # Words without text carry None at index 4; concatenating None
          # with a string would raise TypeError, so those entries are skipped.
          new_arr = "".join(x[4] + " " for x in element if x[4] is not None)
          # findall returns one (iso, en) tuple per match and an empty list
          # when nothing matches, so no preceding search is required.
          for iso_match, en_match in re.findall(reg, new_arr):
              if iso_match:
                  details_.append(iso_match.replace(")", ""))
              if en_match:
                  details_.append(en_match)
          if re.search(reg_general, new_arr):
              general_tol = new_arr
      return details_, general_tol
  def get_tables(result):
      """Collect the entries of ``result`` whose key marks a table.

      A key qualifies when it contains ``Start drawing`` or ``All dimensions``
      (case-sensitive regex search).

      Args:
          result: Container that yields string keys when iterated and returns
              an iterable of items for ``result[key]`` (e.g. a dict of lists).

      Returns:
          A list with one entry per qualifying key, in iteration order. Each
          entry is a new list holding the items of ``result[key]`` followed
          by the key itself.
      """
      reg = r"(Start drawing)|(All dimensions)"
      tables = []
      for element in result:
          if re.search(reg, element):
              # Items first, the matching key as the last element.
              tables.append(list(result[element]) + [element])
      return tables