import subprocess

import numpy as np
import PyPDF2

import get_distances
import order_bounding_boxes_in_each_block
import read_from_clustered_merged
from clustering_precomputed import clustering_precomputed_dbscan_noParallels as dbscan
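
# Iteratively clusters the word bounding boxes extracted from a PDF drawing with
# DBSCAN on a precomputed distance matrix: eps starts at the smallest k-nearest-
# neighbour distance and is raised step by step until the internal validation
# metrics (Davies-Bouldin, Calinski-Harabasz, silhouette) degrade while the average
# number of words per block exceeds its initial value. Each stage is rendered onto
# a JPEG of the page for visual inspection.
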
with open('/home/bscheibel/PycharmProjects/clustering/config.txt', 'r') as myfile:
    config_path = myfile.read().strip()  # strip the trailing newline so path joins work
print("Path: ", config_path)


def calculate_inner_distance(result):
    min_size_x, max_size_x, min_size_y, max_size_y, diagonal = get_distances.size_blocks(result)
    #print("inner distance: ", diagonal)


def find_nearest_above(my_array, target):
    # Return the index of the smallest value in my_array that is strictly greater
    # than target, or None if no such value exists.
    diff = np.asarray(my_array) - target  # works for plain lists as well as arrays
    # mask the negative differences and zero, since we are looking for values above
    mask = np.ma.less_equal(diff, 0)
    if np.all(mask):
        return None  # target is greater than or equal to every value
    masked_diff = np.ma.masked_array(diff, mask)
    return masked_diff.argmin()
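
# For example, find_nearest_above([1, 3, 7], 3) masks 1 and 3 and returns index 2,
# i.e. 7 is the next larger value, so knn[2] would be the next eps candidate in main().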


def avg_words_block_clustered(words, cluster):
    return words / cluster


def convert_pdf_img(filename):
    subprocess.call(['pdftoppm', '-jpeg', '-singlefile',
                     filename, config_path + '/temporary/out'])


def read_pdf(filename):
    # legacy PyPDF2 (pre-3.0) API: PdfFileReader/getPage/mediaBox
    pdf = PyPDF2.PdfFileReader(filename, strict=False)
    p = pdf.getPage(0)
    w = p.mediaBox.getWidth()
    h = p.mediaBox.getHeight()
    orientation = p.get('/Rotate')
    return w, h, orientation


def read_webpage(filename):
    # placeholder, web pages are not supported yet
    return "test"


def get_min_nn(result, path):
    dm = get_distances.distance_btw_blocks(result, path)
    knn = get_distances.distance_knn(dm)
    return sorted(set(knn))
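
# The sorted, de-duplicated knn distances double as the candidate eps values:
# main() starts with the smallest one and steps upwards through this list.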


def show_boxes(filepath, clean_arrays, eps):
    img_path = config_path + '/temporary/out.jpg'
    w, h, orientation = read_pdf(filepath)
    convert_pdf_img(filepath)
    filename = filepath.split("/")[-1]
    filename = filename.rsplit(".", 1)[0]  # strip the extension regardless of case (.pdf/.PDF)
    read_from_clustered_merged.highlight_image(clean_arrays, img_path, w, h, orientation, eps, filename)
    return filename


def main(uuid=123, filepath=config_path + "/drawings/Stahl_Adapterplatte.PDF"):
    path = config_path
    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    # get coordinates and text out of the HTML file into an array of arrays
    result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
    print("number_blocks:", number_blocks)
    print("number_words:", number_words)
    avg_words_block = number_words / number_blocks
    print("avg words/blocks", avg_words_block)

    result_df = get_distances.get_average_xy(result, path)

    # the smallest k-nearest-neighbour distance is the starting eps
    knn = get_min_nn(result_df, path)
    eps = min(knn)
    print("min_knn: ", eps)

    # try one clustering iteration with eps = smallest knn value
    res, number_clusters, dbs, chs_old, silhouette, dm = dbscan.cluster_and_preprocess(result, eps, path)
    # read and show the unclustered default blocks
    clean_arrays = read_from_clustered_merged.read_default(path + "/temporary/list_to_csv_with_corner_points.csv")
    show_boxes(filepath, clean_arrays, "default")

    # read and show the result of the first clustering iteration
    clean_arrays = read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    show_boxes(filepath, clean_arrays, eps)

    # stop criterion: average number of words per block after clustering
    avg_words_block = avg_words_block_clustered(number_words, number_clusters)
    print("a/w first clustering eps=1: ", avg_words_block)

    # cluster as long as the stop criterion is not met
    print("cluster, eps: ", eps)
    chs = chs_old
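    # The loop raises eps step by step through the knn distances and stops as soon as
    # one of the internal metrics degrades (Davies-Bouldin increases, Calinski-Harabasz
    # or silhouette decreases) while the average words-per-block value exceeds its
    # initial level, or when there is no larger knn distance left to try.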
    while True:  # run until one of the stop conditions below breaks out
        print("cluster, eps: ", eps)
        dbs_old = dbs
        chs_old = chs
        silhouette_old = silhouette
        res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, eps, path)
        # stop criterion still to be settled: silhouette, Davies-Bouldin, Calinski-Harabasz,
        # or a combination of the three
        avg_words_block_new = avg_words_block_clustered(number_words, number_clusters)
        print("avg_words_blocks:", avg_words_block_new)
        read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
        print(dbs <= dbs_old)
        print(chs >= chs_old)
        print(silhouette >= silhouette_old)
        old_eps = eps
        # diagnostic blocks to see which condition triggers first, mostly silhouette and dbs
        if not dbs <= dbs_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached dbs")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold dbs")
        if not chs >= chs_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached chs")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold chs")
        if not silhouette >= silhouette_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached silhouette")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold silhouette")
        # combined stop condition: with 'or' it does not cluster too much,
        # but it can also stop too early
        if (not dbs <= dbs_old or not chs >= chs_old or not silhouette >= silhouette_old) and avg_words_block_new > avg_words_block:
            print("stopping threshold reached")
            break
        # move eps to the next larger knn distance; stop when there is none left
        next_idx = find_nearest_above(knn, eps)
        if next_idx is None:
            print("highest nn value reached")
            break
        eps = knn[next_idx]

    # re-cluster with the last eps that was used and render the final result
    res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, old_eps, path)
    print("Last EPS: ", old_eps)
    print("Last W/B: ", avg_words_block)
    clean_arrays = read_from_clustered_merged.read_default(path + "/temporary/list_to_csv_with_corner_points.csv")
    show_boxes(filepath, clean_arrays, "default")
    clean_arrays = read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    show_boxes(filepath, clean_arrays, eps)


if __name__ == "__main__":
    main()