12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
import json
import sys

import numpy as np
import redis

import clustering_precomputed_dbscan_noParallels as dbscan
import get_distances
import order_bounding_boxes_in_each_block
import organize_drawing_according_to_details_new
import read_from_clustered_merged
import regex_clean_new
#import algoritm_knn
# Base working directory for the extraction pipeline; intermediate files
# are written under "<config_path>/temporary/" (see main() below).
config_path = "/home/bscheibel/technical_drawings_extraction"
def get_min_nn(result, path):
    """Return the sorted, de-duplicated k-nearest-neighbour distances
    between the text blocks described by *result*."""
    distance_matrix = get_distances.distance_btw_blocks(result, path)
    neighbour_distances = get_distances.distance_knn(distance_matrix)
    return sorted(set(neighbour_distances))
def find_nearest_above(my_array, target):
    """Return the index of the smallest element of *my_array* that is
    strictly greater than *target*, or None if no element is larger.

    Accepts any numeric sequence: callers in this file pass a plain
    Python list (the k-NN distance list), which the original code could
    not subtract a scalar from — convert to an ndarray first.
    """
    diff = np.asarray(my_array) - target
    # Mask every value at or below the target so argmin only considers
    # the strictly-greater candidates.
    mask = np.ma.less_equal(diff, 0)
    if np.all(mask):
        return None  # target is >= every value in the array
    masked_diff = np.ma.masked_array(diff, mask)
    return int(masked_diff.argmin())
def write_redis(uuid, result, db_params):
    """Store *result* under key *uuid* in Redis.

    *db_params* is passed positionally to redis.Redis, i.e. it is the
    host name (e.g. "localhost").
    """
    # Removed leftover debug print and commented-out unix-socket variant.
    db = redis.Redis(db_params)
    db.set(uuid, result)
def main(uuid, filepath, db, eps):
    """Extract dimension/tolerance text from the PDF drawing at *filepath*,
    cluster its text blocks with DBSCAN (growing eps until the silhouette
    score stops improving), and persist the results in Redis under keys
    derived from *uuid*.

    NOTE(review): the *eps* parameter is overwritten before first use
    (re-derived from the k-NN distances below), so the caller-supplied
    value is effectively ignored.
    """
    print("TEEEEST")  # leftover debug output
    print(filepath)
    path = config_path
    # Convert the PDF to HTML, then pull word bounding boxes out of it.
    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    result, number_blocks, number_words= order_bounding_boxes_in_each_block.get_bound_box(filename) ##get coordinates+text out of html file into array of arrays
    # ISO references and the general tolerance note are extracted separately.
    isos, general_tol = order_bounding_boxes_in_each_block.extract_isos(result)
    result_df = get_distances.get_average_xy(result, path)
    knn = get_min_nn(result_df, path)
    # Start clustering with the smallest observed nearest-neighbour distance.
    eps = min(knn)
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    stopping_criterion = False
    while(not stopping_criterion): # this condition has to be changed to the breaking condition
        print("cluster, eps: ", eps)
        silhoutte_old = silhoutte
        res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)
        read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
        old_eps = eps
        # Stop once the silhouette score no longer improves.
        # NOTE(review): even when the criterion fires, eps is still advanced
        # below, so the post-loop clustering runs with the NEXT eps value,
        # not old_eps — confirm this is intended.
        if (not silhoutte >= silhoutte_old): #and avg_words_block_new > avg_words_block:
            print("stopping threshold reached")
            stopping_criterion = True
        try:
            # Step eps up to the next-larger k-NN distance for the next pass.
            eps = find_nearest_above(knn, eps)
            eps = knn[eps]
        except:
            # find_nearest_above returned None (or raised): no larger
            # distance available, so stop growing eps.
            print("highest nn value reached")
            break
    # Final clustering with the last eps value.
    # NOTE(review): result bound to "silhouette" here while the loop uses
    # the misspelled "silhoutte"; this value is never read afterwards.
    res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, eps, path)
    clean_arrays = read_from_clustered_merged.read(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    tables = order_bounding_boxes_in_each_block.get_tables(clean_arrays)
    pretty = regex_clean_new.print_clean(clean_arrays)
    res, details_dict = organize_drawing_according_to_details_new.main_function(pretty, tables)
    # Serialise and persist everything under uuid-derived keys.
    json_isos = json.dumps(isos)
    json_result = json.dumps(res)
    json_details =json.dumps(details_dict)
    write_redis(uuid+"tol", general_tol,db)
    write_redis(uuid+"dims", json_result, db)
    write_redis(uuid+"isos",json_isos, db)
    # NOTE(review): the "eps" key actually stores block/word counts, not eps.
    write_redis(uuid+"eps", str(number_blocks)+","+str(number_words), db)
    write_redis(uuid+"details",json_details ,db)
if __name__ == "__main__":
    # CLI: <uuid> <pdf-path> <redis-host> <eps>
    run_uuid = sys.argv[1]
    pdf_path = sys.argv[2]
    redis_host = sys.argv[3]
    eps_value = sys.argv[4]
    main(run_uuid, pdf_path, redis_host, eps_value)
    #main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/5129275_Rev01-GV12.pdf", "localhost",3)
|