algoritm_knn.py

import subprocess
import PyPDF2
import numpy as np
#import csv
import order_bounding_boxes_in_each_block
import read_from_clustered_merged
import get_distances
import clustering_precomputed_dbscan_noParallels as dbscan

# read the project path from the config file; note that config_path keeps a
# trailing newline if config.txt ends with one (strip() it in that case)
with open('/home/bscheibel/PycharmProjects/clustering/config.txt', 'r') as myfile:
    config_path = myfile.read()
print("Path: ", config_path)
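
# This script clusters the text blocks of a technical drawing PDF with DBSCAN.
# eps is taken from the k-nearest-neighbour distances between blocks and is
# increased stepwise until the internal quality metrics stop improving.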

def calculate_inner_distance(result):
    # computes block size statistics; the values are currently unused (no return)
    min_size_x, max_size_x, min_size_y, max_size_y, diagonal = get_distances.size_blocks(result)
    #print("inner distance: ", diagonal)

def find_nearest_above(my_array, target):
    diff = my_array - target
    # mask the negative and zero differences, since we are looking for
    # values strictly above the target
    mask = np.ma.less_equal(diff, 0)
    if np.all(mask):
        return None  # target is greater than or equal to every value
    masked_diff = np.ma.masked_array(diff, mask)
    return masked_diff.argmin()
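
# Illustrative example (not part of the original code): with
# my_array = np.array([0.5, 1.2, 3.0]) and target = 1.2, the differences are
# [-0.7, 0.0, 1.8]; the first two get masked, so find_nearest_above returns
# index 2, the position of the smallest value strictly above the target.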

def avg_words_block_clustered(words, cluster):
    blocks = cluster
    avg_words = words / blocks
    return avg_words
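
# For example, avg_words_block_clustered(120, 30) == 4.0: 120 words spread over
# 30 clusters average out to 4 words per block.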

def convert_pdf_img(filename):
    # render the PDF to <config_path>/temporary/out.jpg via poppler's pdftoppm
    subprocess.call(['pdftoppm', '-jpeg', '-singlefile',
                     filename, config_path + '/temporary/out'])

def read_pdf(filename):
    # note: this uses the legacy PyPDF2 (< 3.0) API
    pdf = PyPDF2.PdfFileReader(filename, strict=False)
    p = pdf.getPage(0)
    w = p.mediaBox.getWidth()
    h = p.mediaBox.getHeight()
    orientation = p.get('/Rotate')
    return w, h, orientation
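
# A minimal sketch of the same read with the current pypdf (>= 3.0) API, in
# case PyPDF2 is unavailable (assumption: the pypdf package is installed):
#   from pypdf import PdfReader
#   page = PdfReader(filename).pages[0]
#   w, h = page.mediabox.width, page.mediabox.height
#   orientation = page.get('/Rotate')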

def read_webpage(filename):
    return "test"  # placeholder, not implemented yet

def get_min_nn(result, path):
    # sorted list of the unique k-nearest-neighbour distances between blocks,
    # used as the candidate eps values for DBSCAN
    dm = get_distances.distance_btw_blocks(result, path)
    knn = get_distances.distance_knn(dm)
    knn = sorted(set(knn))
    return knn
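
# Illustrative example (hypothetical distances): if distance_knn returns
# [3.2, 3.2, 5.1, 8.0], get_min_nn yields [3.2, 5.1, 8.0]; main() starts
# clustering with eps = 3.2 and later steps up through 5.1 and 8.0.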

def show_boxes(filepath, clean_arrays, eps):
    img_path = config_path + '/temporary/out.jpg'
    w, h, orientation = read_pdf(filepath)
    convert_pdf_img(filepath)
    filename = filepath.split("/")[-1]
    filename = filename.split(".pdf")[0]
    read_from_clustered_merged.highlight_image(clean_arrays, img_path, w, h, orientation, eps, filename)
    return filename

def main(uuid=123, filepath=config_path + "/" + "drawings/Stahl_Adapterplatte.PDF"):
    path = config_path
    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
    # get coordinates+text out of the html file into an array of arrays
    result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
    print("number_blocks:", number_blocks)
    print("number_words:", number_words)
    avg_words_block = number_words / number_blocks
    print("avg words/blocks", avg_words_block)
    result_df = get_distances.get_average_xy(result, path)
    # get min_nn
    knn = get_min_nn(result_df, path)
    #print("knn: ", knn)
    eps = min(knn)
    print("min_knn: ", eps)
    # try one clustering iteration with eps = smallest knn value
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    # res = res.drop(res.columns[[0, 1]], axis=1).to_csv("test.csv", header=False)
    # #res = res.reset_index().tolist()
    # with open(path+"/test.csv") as csvfile:
    #     readCSV = csv.reader(csvfile, delimiter=';')
    #     res = list(readCSV)
    # print(res)
    #get_distances.get_average_xy(res, path)
    # (repeat of the call above; redundant while the block in between stays commented out)
    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
    # read default values
    clean_arrays = read_from_clustered_merged.read_default(path + "/temporary/list_to_csv_with_corner_points.csv")
    show_boxes(filepath, clean_arrays, "default")
    #print(number_clusters)
    clean_arrays = read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    # show results
    show_boxes(filepath, clean_arrays, eps)
    # look at the stop criterion
    avg_words_block = avg_words_block_clustered(number_words, number_clusters)
    print("a/w first clustering eps=1: ", avg_words_block)
    # cluster as long as the stop criterion is not met
    print("cluster, eps: ", eps)
    chs = chs_old
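    # The loop below sweeps eps upwards through the candidate knn distances and
    # re-clusters at each step. It stops once one of the internal quality
    # metrics (Davies-Bouldin, Calinski-Harabasz, silhouette) stops improving
    # while the average words-per-block already exceeds the first clustering's.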
    while True:  # runs until one of the stop criteria below breaks out
        print("cluster, eps: ", eps)
        dbs_old = dbs
        chs_old = chs
        silhoutte_old = silhoutte
        res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)
        # the stop criterion still has to be established (silhouette,
        # Davies-Bouldin, CH?), or a combination of the three
        avg_words_block_new = avg_words_block_clustered(number_words, number_clusters)
        print("avg_words_blocks:", avg_words_block_new)
        #stop_criterion = avg_words_block_new-avg_words_block
        read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
        # show results
        #show_boxes(filepath, clean_arrays, eps)
        print(dbs <= dbs_old)
        print(chs >= chs_old)
        print(silhoutte >= silhoutte_old)
        old_eps = eps
        # block to see which condition applies first, mostly silhouette and dbs
        if not dbs <= dbs_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached dbs")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold dbs")
        if not chs >= chs_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached chs")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold chs")
        if not silhoutte >= silhoutte_old and avg_words_block_new > avg_words_block:
            print("stopping threshold reached silhouette")
            clean_arrays = read_from_clustered_merged.read(
                path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
            show_boxes(filepath, clean_arrays, "threshold silhouette")
        # and/or: "or" does not cluster too much, but can also cluster too little
        if (not dbs <= dbs_old or not chs >= chs_old or not silhoutte >= silhoutte_old) and avg_words_block_new > avg_words_block:
            print("stopping threshold reached")
            break
        try:
            # advance eps to the next larger knn distance; knn is a plain list,
            # so convert it for the numpy arithmetic in find_nearest_above
            idx = find_nearest_above(np.array(knn), eps)
            eps = knn[idx]
        except (TypeError, IndexError):  # idx is None once the largest value is reached
            print("highest nn value reached")
            break
    # re-run the clustering with the last eps value and show the final results
    res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, old_eps, path)
    print("Last EPS: ", old_eps)
    print("Last W/B: ", avg_words_block)
    clean_arrays = read_from_clustered_merged.read_default(path + "/temporary/list_to_csv_with_corner_points.csv")
    show_boxes(filepath, clean_arrays, "default")
    clean_arrays = read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
    # show results
    show_boxes(filepath, clean_arrays, eps)

if __name__ == "__main__":
    main()
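
# To run the whole pipeline on the default drawing (assuming config.txt points
# at the project root and poppler's pdftoppm is on the PATH):
#   python algoritm_knn.py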