# main.py — technical-drawing extraction pipeline (clustering of PDF bounding boxes)
  1. import order_bounding_boxes_in_each_block
  2. import clustering_precomputed_dbscan as dbscan
  3. import read_from_clustered_merged
  4. import organize_drawing_according_to_details_new
  5. import redis
  6. import json
  7. import sys
  8. import csv
  9. import numpy as np
# Base working directory for extraction artifacts (hard-coded deployment path;
# a "temporary/" subfolder is expected to exist underneath it).
config_path = "/home/bscheibel/technical_drawings_extraction"
  11. def distance_knn(dm):
  12. knn = []
  13. for row in dm:
  14. row = row[row != 0]
  15. row = row.round(decimals=2)
  16. row = sorted(row)
  17. knn.extend(row[:2])
  18. return knn
  19. def distance_btw_blocks(result, path):
  20. result.to_csv(path+"/temporary/blub_distances.csv", sep=";", index=False, header=None)
  21. with open(path+"/temporary/blub_distances.csv") as csvfile:
  22. read_csv = csv.reader(csvfile, delimiter=';')
  23. result = list(read_csv)
  24. dm = np.asarray([[dbscan.dist(p1, p2) for p2 in result] for p1 in result])
  25. return dm
  26. def get_min_nn(result, path):
  27. dm = distance_btw_blocks(result, path)
  28. knn = distance_knn(dm)
  29. knn = list(set(knn))
  30. knn = sorted(knn)
  31. return knn
  32. def find_nearest_above(my_array, target):
  33. diff = my_array - target
  34. mask = np.ma.less_equal(diff, 0)
  35. if np.all(mask):
  36. return None
  37. masked_diff = np.ma.masked_array(diff, mask)
  38. return masked_diff.argmin()
def write_redis(uuid, result, db_params):
    # Store *result* under key *uuid* in Redis.
    # NOTE(review): db_params is passed positionally to redis.Redis(), i.e. it
    # is interpreted as the *host* argument — confirm callers really pass a
    # hostname here (main() forwards sys.argv[3] unchanged).
    db_params = redis.Redis(db_params)
    #db = db = redis.Redis(unix_socket_path='/tmp/redis.sock',db=7)
    # Debug output of the connection object (left in place intentionally?).
    print(db_params)
    db_params.set(uuid, result)
def main(uuid, filepath, db, eps_manual):
    """Run the full extraction pipeline for one technical drawing.

    uuid       -- identifier used as the Redis key prefix
    filepath   -- path to the input PDF
    db         -- Redis connection parameter, forwarded to write_redis()
    eps_manual -- accepted but never used in this function (TODO confirm
                  whether a manual eps override was intended)
    """
    path = config_path
    # PDF -> HTML, then bounding boxes (coordinates + text) per block/word.
    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    result, number_blocks, number_words= order_bounding_boxes_in_each_block.get_bound_box(filename) ##get coordinates+text out of html file into array of arrays
    isos, general_tol = order_bounding_boxes_in_each_block.extract_isos(result)
    result_df = dbscan.get_average_xy(result, path)
    # Candidate eps values: sorted unique nearest-neighbour distances;
    # start clustering from the smallest one.
    knn = get_min_nn(result_df, path)
    eps = min(knn)
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    # Grow eps step by step while the silhouette score keeps improving.
    stopping_criterion = False
    while not stopping_criterion:
        print("cluster, eps: ", eps)
        silhoutte_old = silhoutte
        res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)
        # NOTE(review): return value discarded here — presumably called for a
        # side effect of read(); the final read happens again after the loop.
        read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
        old_eps = eps
        if not silhoutte >= silhoutte_old:
            # Score got worse: remember to stop (old_eps holds the last good eps).
            print("stopping threshold reached")
            stopping_criterion = True
        try:
            # Advance to the next candidate eps strictly above the current one.
            eps = find_nearest_above(knn, eps)
            eps = knn[eps]
        except:
            # Exhausted the candidate list (find_nearest_above returned None,
            # which fails as a list index) -> exit the loop.
            print("highest nn value reached")
            break
    # Re-cluster with the last eps whose score had not degraded.
    # NOTE(review): result is bound to `silhouette` while the loop used the
    # misspelled `silhoutte` — the value is unused either way.
    res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, old_eps, path)
    clean_arrays = read_from_clustered_merged.read(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    tables = order_bounding_boxes_in_each_block.get_tables(clean_arrays)
    pretty = read_from_clustered_merged.print_clean(clean_arrays)
    res, details_dict = organize_drawing_according_to_details_new.main_function(pretty, tables)
    # Serialize and persist all results under uuid-prefixed Redis keys.
    json_isos = json.dumps(isos)
    json_result = json.dumps(res)
    json_details = json.dumps(details_dict)
    write_redis(uuid+"tol", general_tol, db)
    write_redis(uuid+"dims", json_result, db)
    write_redis(uuid+"isos",json_isos, db)
    write_redis(uuid+"eps", str(number_blocks)+"," + str(number_words), db)
    write_redis(uuid+"details", json_details, db)
  82. if __name__ == "__main__":
  83. uuid = sys.argv[1]
  84. filename = sys.argv[2]
  85. db = sys.argv[3]
  86. eps = sys.argv[4]
  87. main(uuid, filename, db, eps)