"""Entry point that clusters the extracted content of a drawing and stores it in Redis.

Run as a script with four positional arguments::

    <uuid> <file path> <redis host> <eps>
"""

import json
import sys

import numpy as np
import redis

import order_bounding_boxes_in_each_block
import clustering_precomputed_dbscan_noParallels as dbscan
import read_from_clustered_merged
import regex_clean_new
import organize_drawing_according_to_details_new
import get_distances
#import algoritm_knn

# Working directory handed to every helper module as ``path``.
config_path = "/home/bscheibel/technical_drawings_extraction"


def get_min_nn(result, path):
    """Return the distinct nearest-neighbour values for *result*, ascending.

    Args:
        result: Input forwarded to ``get_distances.distance_btw_blocks``.
        path: Working directory forwarded to the same helper.

    Returns:
        A sorted list of the unique values produced by
        ``get_distances.distance_knn``.
    """
    dm = get_distances.distance_btw_blocks(result, path)
    knn = get_distances.distance_knn(dm)
    return sorted(set(knn))


def find_nearest_above(my_array, target):
    """Return the index of the smallest element strictly greater than *target*.

    Args:
        my_array: Sequence or array of numbers; lists are accepted.
        target: Threshold value.

    Returns:
        The (flattened) index as an ``int``, or ``None`` when no element is
        greater than *target* (which includes empty input).
    """
    values = np.asarray(my_array)
    above = values > target
    if not above.any():
        return None
    # Replace non-candidates by +inf so argmin only ever picks a value > target.
    return int(np.where(above, values, np.inf).argmin())


def write_redis(uuid, result, db_params):
    """Store *result* under key *uuid* in the Redis server at *db_params*.

    Args:
        uuid: Redis key.
        result: Value to store.
        db_params: Host name of the Redis server (first positional argument
            of ``redis.Redis``).
    """
    # A Unix-socket connection is an alternative, e.g.
    # redis.Redis(unix_socket_path='/tmp/redis.sock', db=7).
    db = redis.Redis(host=db_params)
    print(db_params)
    db.set(uuid, result)


def main(uuid, filepath, db, eps):
    """Process one file and write the results to Redis.

    Args:
        uuid: Identifier used as prefix for all Redis keys that are written.
        filepath: Path of the input file.
        db: Redis host name, forwarded to ``write_redis``.
        eps: Accepted for interface compatibility but ignored; the starting
            eps is always the smallest nearest-neighbour value.

    Side effects:
        Writes the keys ``<uuid>tol``, ``<uuid>dims``, ``<uuid>isos``,
        ``<uuid>eps`` and ``<uuid>details`` to Redis.
    """
    print("TEEEEST")
    print(filepath)
    path = config_path
    clustered_csv = path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv"

    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    # Get coordinates + text out of the html file into an array of arrays.
    result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
    isos, general_tol = order_bounding_boxes_in_each_block.extract_isos(result)
    result_df = get_distances.get_average_xy(result, path)

    # Candidate eps values; the search starts at the smallest one.
    knn = get_min_nn(result_df, path)
    eps = min(knn)
    _, _, _, _, silhouette, dm = dbscan.cluster_and_preprocess(result, eps, path)

    stopping_criterion = False
    while not stopping_criterion:
        print("cluster, eps: ", eps)
        silhouette_old = silhouette
        _, _, _, _, silhouette = dbscan.clustering(dm, eps, path)
        # Return value is not used here; the call is kept in case it has side effects.
        read_from_clustered_merged.read(clustered_csv)

        # Written as ``not >=`` (rather than ``<``) so a NaN score also stops the search.
        if not silhouette >= silhouette_old:
            print("stopping threshold reached")
            stopping_criterion = True

        # NOTE(review): eps is advanced even on the stopping iteration, so the
        # final clustering below uses the next larger value - confirm intended.
        next_index = find_nearest_above(knn, eps)
        if next_index is None:
            # No larger candidate left: keep the last valid eps and stop.
            print("highest nn value reached")
            break
        eps = knn[next_index]

    # Re-run the clustering with the chosen eps before reading the csv;
    # presumably this call refreshes that file - verify against dbscan.clustering.
    dbscan.clustering(dm, eps, path)
    clean_arrays = read_from_clustered_merged.read(clustered_csv)
    tables = order_bounding_boxes_in_each_block.get_tables(clean_arrays)
    pretty = regex_clean_new.print_clean(clean_arrays)
    res, details_dict = organize_drawing_according_to_details_new.main_function(pretty, tables)

    json_isos = json.dumps(isos)
    json_result = json.dumps(res)
    json_details = json.dumps(details_dict)

    write_redis(uuid + "tol", general_tol, db)
    write_redis(uuid + "dims", json_result, db)
    write_redis(uuid + "isos", json_isos, db)
    # Despite the key suffix, this entry holds "<number_blocks>,<number_words>".
    write_redis(uuid + "eps", f"{number_blocks},{number_words}", db)
    write_redis(uuid + "details", json_details, db)


if __name__ == "__main__":
    uuid = sys.argv[1]
    filename = sys.argv[2]
    db = sys.argv[3]
    eps = sys.argv[4]
    main(uuid, filename, db, eps)
    # Example call:
    # main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/5129275_Rev01-GV12.pdf", "localhost", 3)