main.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. import order_bounding_boxes_in_each_block
  2. import clustering_precomputed_dbscan as dbscan
  3. import read_from_clustered_merged
  4. import organize_drawing_according_to_details_new
  5. import redis
  6. import json
  7. import sys
  8. import csv
  9. import numpy as np
  10. with open('/home/bscheibel/PycharmProjects/clustering/config.txt', 'r') as file:
  11. config_path = file.read()
  12. print("Path: ", config_path)
  13. def distance_knn(dm):
  14. knn = []
  15. for row in dm:
  16. row = row[row != 0]
  17. row = row.round(decimals=2)
  18. row = sorted(row)
  19. knn.extend(row[:2])
  20. return knn
  21. def distance_btw_blocks(result, path):
  22. result.to_csv(path+"/temporary/blub_distances.csv", sep=";", index=False, header=None)
  23. with open(path+"/temporary/blub_distances.csv") as csvfile:
  24. read_csv = csv.reader(csvfile, delimiter=';')
  25. result = list(read_csv)
  26. dm = np.asarray([[dbscan.dist(p1, p2) for p2 in result] for p1 in result])
  27. return dm
  28. def get_min_nn(result, path):
  29. dm = distance_btw_blocks(result, path)
  30. knn = distance_knn(dm)
  31. knn = list(set(knn))
  32. knn = sorted(knn)
  33. return knn
  34. def find_nearest_above(my_array, target):
  35. diff = my_array - target
  36. mask = np.ma.less_equal(diff, 0)
  37. if np.all(mask):
  38. return None
  39. masked_diff = np.ma.masked_array(diff, mask)
  40. return masked_diff.argmin()
  41. def write_redis(uuid, result, db_params):
  42. db_params = redis.Redis(db_params)
  43. #db = db = redis.Redis(unix_socket_path='/tmp/redis.sock',db=7)
  44. print(db_params)
  45. db_params.set(uuid, result)
  46. def main(uuid, filepath, db, eps_manual):
  47. path = config_path
  48. filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
  49. result, number_blocks, number_words= order_bounding_boxes_in_each_block.get_bound_box(filename) ##get coordinates+text out of html file into array of arrays
  50. isos, general_tol = order_bounding_boxes_in_each_block.extract_isos(result)
  51. result_df = dbscan.get_average_xy(result, path)
  52. knn = get_min_nn(result_df, path)
  53. eps = min(knn)
  54. res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
  55. stopping_criterion = False
  56. while not stopping_criterion:
  57. print("cluster, eps: ", eps)
  58. silhoutte_old = silhoutte
  59. res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)
  60. read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
  61. old_eps = eps
  62. if not silhoutte >= silhoutte_old:
  63. print("stopping threshold reached")
  64. stopping_criterion = True
  65. try:
  66. eps = find_nearest_above(knn, eps)
  67. eps = knn[eps]
  68. except:
  69. print("highest nn value reached")
  70. break
  71. res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, old_eps, path)
  72. clean_arrays = read_from_clustered_merged.read(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv")
  73. tables = order_bounding_boxes_in_each_block.get_tables(clean_arrays)
  74. pretty = read_from_clustered_merged.print_clean(clean_arrays)
  75. res, details_dict = organize_drawing_according_to_details_new.main_function(pretty, tables)
  76. json_isos = json.dumps(isos)
  77. json_result = json.dumps(res)
  78. json_details = json.dumps(details_dict)
  79. write_redis(uuid+"tol", general_tol, db)
  80. write_redis(uuid+"dims", json_result, db)
  81. write_redis(uuid+"isos",json_isos, db)
  82. write_redis(uuid+"eps", str(number_blocks)+"," + str(number_words), db)
  83. write_redis(uuid+"details", json_details, db)
  84. if __name__ == "__main__":
  85. uuid = sys.argv[1]
  86. filename = sys.argv[2]
  87. db = sys.argv[3]
  88. eps = sys.argv[4]
  89. main(uuid, filename, db, eps)