main.py 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
import json
import sys

import numpy as np
import redis

import clustering_precomputed_dbscan_noParallels as dbscan
import get_distances
import order_bounding_boxes_in_each_block
import organize_drawing_according_to_details_new
import read_from_clustered_merged
import regex_clean_new
#import algoritm_knn
# Working directory used for intermediate artifacts (HTML conversion output,
# temporary CSV files written/read by the clustering steps).
config_path = "/home/bscheibel/technical_drawings_extraction"
  12. def get_min_nn(result, path):
  13. dm = get_distances.distance_btw_blocks(result, path)
  14. knn = get_distances.distance_knn(dm)
  15. knn = list(set(knn))
  16. knn = sorted(knn)
  17. return knn
  18. def find_nearest_above(my_array, target):
  19. diff = my_array - target
  20. mask = np.ma.less_equal(diff, 0)
  21. if np.all(mask):
  22. return None # returns None if target is greater than any value
  23. masked_diff = np.ma.masked_array(diff, mask)
  24. return masked_diff.argmin()
  25. def write_redis(uuid, result, db_params):
  26. db = redis.Redis(db_params)
  27. #db = db = redis.Redis(unix_socket_path='/tmp/redis.sock',db=7)
  28. print(db_params)
  29. db.set(uuid, result)
def main(uuid, filepath, db, eps):
    """Extract dimension/tolerance data from the technical drawing PDF at
    *filepath* and write the results to Redis (host *db*) under keys derived
    from *uuid*.

    NOTE(review): the *eps* parameter is immediately overwritten by min(knn)
    below and is therefore never used — confirm whether callers may drop it.
    """
    print("TEEEEST")  # leftover debug output
    print(filepath)
    path = config_path
    # Convert the PDF to HTML, then pull word bounding boxes out of the HTML.
    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    result, number_blocks, number_words= order_bounding_boxes_in_each_block.get_bound_box(filename) ##get coordinates+text out of html file into array of arrays
    isos, general_tol = order_bounding_boxes_in_each_block.extract_isos(result)
    result_df = get_distances.get_average_xy(result, path)
    # Candidate eps values: sorted unique nearest-neighbour distances.
    knn = get_min_nn(result_df, path)
    eps = min(knn)  # start clustering at the smallest knn distance
    # Initial clustering pass; dm is the precomputed distance matrix reused
    # by every dbscan.clustering call in the loop below.
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    stopping_criterion = False
    while(not stopping_criterion): # this condition has to be changed to the breaking condition
        print("cluster, eps: ", eps)
        silhoutte_old = silhoutte
        res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)
        # NOTE(review): return value discarded — presumably called for a side
        # effect on the temporary CSV; confirm before removing.
        read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
        old_eps = eps
        # Stop once the silhouette score gets worse than the previous round.
        if (not silhoutte >= silhoutte_old): #and avg_words_block_new > avg_words_block:
            print("stopping threshold reached")
            stopping_criterion = True
        try:
            # Step eps up to the next larger knn distance for the next round.
            eps = find_nearest_above(knn, eps)
            eps = knn[eps]
        except:
            # Reached when no larger knn value exists (find_nearest_above
            # returned None). NOTE(review): the bare except also hides real
            # errors, e.g. the missing numpy import inside find_nearest_above.
            print("highest nn value reached")
            break
    # Final clustering with the last eps reached. NOTE(review): `silhouette`
    # (correct spelling) is assigned here but never read — elsewhere the
    # misspelled `silhoutte` is used; confirm this is intentional.
    res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, eps, path)
    clean_arrays = read_from_clustered_merged.read(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    tables = order_bounding_boxes_in_each_block.get_tables(clean_arrays)
    pretty = regex_clean_new.print_clean(clean_arrays)
    res, details_dict = organize_drawing_according_to_details_new.main_function(pretty, tables)
    # Serialize and persist all results under uuid-derived Redis keys.
    json_isos = json.dumps(isos)
    json_result = json.dumps(res)
    json_details =json.dumps(details_dict)
    write_redis(uuid+"tol", general_tol,db)
    write_redis(uuid+"dims", json_result, db)
    write_redis(uuid+"isos",json_isos, db)
    # NOTE(review): key says "eps" but the value stored is block/word counts.
    write_redis(uuid+"eps", str(number_blocks)+","+str(number_words), db)
    write_redis(uuid+"details",json_details ,db)
if __name__ == "__main__":
    # CLI entry point: main.py <uuid> <pdf-path> <redis-host> <eps>
    uuid = sys.argv[1]
    filename = sys.argv[2]
    db = sys.argv[3]
    # NOTE(review): eps arrives as a string and is recomputed inside main(),
    # so it is effectively unused — confirm before dropping the argument.
    eps = sys.argv[4]
    main(uuid,filename, db, eps)
    #main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/5129275_Rev01-GV12.pdf", "localhost",3)