5 年之前 · cf7c10567f
--- a/__pycache__/algoritm_knn.cpython-37.pyc
+++ b/__pycache__/algoritm_knn.cpython-37.pyc
--- a/__pycache__/clustering_precomputed_dbscan.cpython-37.pyc
+++ b/__pycache__/clustering_precomputed_dbscan.cpython-37.pyc
--- a/__pycache__/clustering_precomputed_dbscan_noParallels.cpython-37.pyc
+++ b/__pycache__/clustering_precomputed_dbscan_noParallels.cpython-37.pyc
--- a/__pycache__/csv_to_text.cpython-37.pyc
+++ b/__pycache__/csv_to_text.cpython-37.pyc
--- a/__pycache__/get_distances.cpython-37.pyc
+++ b/__pycache__/get_distances.cpython-37.pyc
--- a/__pycache__/merge_pandas.cpython-37.pyc
+++ b/__pycache__/merge_pandas.cpython-37.pyc
--- a/__pycache__/order_bounding_boxes_in_each_block.cpython-37.pyc
+++ b/__pycache__/order_bounding_boxes_in_each_block.cpython-37.pyc
--- a/__pycache__/organize_drawing_according_to_details_new.cpython-37.pyc
+++ b/__pycache__/organize_drawing_according_to_details_new.cpython-37.pyc
--- a/__pycache__/read_from_clustered_merged.cpython-37.pyc
+++ b/__pycache__/read_from_clustered_merged.cpython-37.pyc
--- a/__pycache__/regex_clean_new.cpython-37.pyc
+++ b/__pycache__/regex_clean_new.cpython-37.pyc
--- a/algoritm_knn.py
+++ b/algoritm_knn.py
@@ -1,171 +0,0 @@
 
				-import subprocess
			
 
				-import PyPDF2
			
 
				-import numpy as np
			
 
				-#import csv
			
 
				-import order_bounding_boxes_in_each_block
			
 
				-import read_from_clustered_merged
			
 
				-import get_distances
			
 
				-import clustering_precomputed_dbscan_noParallels as dbscan
			
 
				-
			
 
				# Read the project root directory from the machine-specific config file.
				# NOTE(review): the absolute location of config.txt is hard-coded; confirm
				# it exists on the host this module is imported on.
				with open('/home/bscheibel/PycharmProjects/clustering/config.txt', 'r') as myfile:
				    # Strip surrounding whitespace so that a trailing newline in config.txt
				    # does not end up inside every path concatenated from config_path.
				    config_path = myfile.read().strip()
				    print("Path: ", config_path)
			
 
				-
			
 
				def calculate_inner_distance(result):
				    """Return the ``diagonal`` value reported by ``get_distances.size_blocks``.

				    The previous version computed the five size values and discarded all of
				    them, so the function always returned ``None``.  Callers that ignored
				    the return value are unaffected.

				    Args:
				        result: block data, passed unchanged to ``get_distances.size_blocks``.
				            NOTE(review): its expected shape is not visible here - confirm
				            against ``get_distances``.

				    Returns:
				        The fifth value of the tuple returned by ``size_blocks``.
				    """
				    _min_x, _max_x, _min_y, _max_y, diagonal = get_distances.size_blocks(result)
				    return diagonal
			
 
				-
			
 
				def find_nearest_above(my_array, target):
				    """Return the index of the smallest element strictly greater than *target*.

				    Args:
				        my_array: sequence or NumPy array of numbers.  Plain lists are
				            accepted; the previous version raised ``TypeError`` on
				            ``list - float``.
				        target: threshold value.

				    Returns:
				        Index into *my_array* of the nearest value above *target*, or
				        ``None`` when no element is greater than *target*.
				    """
				    # asanyarray keeps array subclasses untouched and converts lists.
				    diff = np.asanyarray(my_array) - target
				    # Mask zero and negative differences: only values above target count.
				    mask = np.ma.less_equal(diff, 0)
				    if np.all(mask):
				        return None
				    masked_diff = np.ma.masked_array(diff, mask)
				    return masked_diff.argmin()
			
 
				-
			
 
				def avg_words_block_clustered(words, cluster):
				    """Return the average number of words per cluster (``words / cluster``).

				    Args:
				        words: total word count.
				        cluster: number of clusters; zero raises ``ZeroDivisionError``.
				    """
				    return words / cluster
			
 
				-
			
 
				def convert_pdf_img(filename):
				    """Run ``pdftoppm`` on *filename* with JPEG, single-file output.

				    The output prefix is ``<config_path>/temporary/out``.  The exit status
				    of the external command is not inspected.
				    """
				    output_prefix = config_path + '/temporary/out'
				    command = ['pdftoppm', '-jpeg', '-singlefile', filename, output_prefix]
				    subprocess.call(command)
			
 
				-
			
 
				def read_pdf(filename):
				    """Return ``(width, height, rotation)`` for the first page of a PDF.

				    Width and height come from the page's media box; rotation is the raw
				    value of the page's ``/Rotate`` entry (``None`` when it is absent).
				    """
				    reader = PyPDF2.PdfFileReader(filename, strict=False)
				    first_page = reader.getPage(0)
				    width = first_page.mediaBox.getWidth()
				    height = first_page.mediaBox.getHeight()
				    rotation = first_page.get('/Rotate')
				    return width, height, rotation
			
 
				-
			
 
				def read_webpage(filename):
				    """Placeholder: ignores *filename* and always returns the string "test"."""
				    placeholder = "test"
				    return placeholder
			
 
				-
			
 
				-
			
 
				def get_min_nn(result, path):
				    """Return the distinct nearest-neighbour distances in ascending order.

				    The block distance matrix is built with
				    ``get_distances.distance_btw_blocks`` and reduced with
				    ``get_distances.distance_knn``; duplicates are dropped before sorting.
				    """
				    distance_matrix = get_distances.distance_btw_blocks(result, path)
				    neighbour_distances = get_distances.distance_knn(distance_matrix)
				    return sorted(set(neighbour_distances))
			
 
				-
			
 
				def show_boxes(filepath, clean_arrays, eps):
				    """Render the PDF to an image and highlight the given blocks on it.

				    Args:
				        filepath: path of the PDF drawing.
				        clean_arrays: block data handed to
				            ``read_from_clustered_merged.highlight_image``.
				        eps: value or label forwarded to ``highlight_image``.

				    Returns:
				        The file name of *filepath* cut at its first ``.pdf`` occurrence.
				        The match is now case-insensitive, so ``drawing.PDF`` yields
				        ``drawing`` (previously the extension was kept).
				    """
				    import re  # local import: this block adds the only use of re

				    img_path = config_path + '/temporary/out.jpg'
				    w, h, orientation = read_pdf(filepath)
				    convert_pdf_img(filepath)
				    filename = filepath.split("/")[-1]
				    extension = re.search(r"\.pdf", filename, flags=re.IGNORECASE)
				    if extension is not None:
				        filename = filename[:extension.start()]
				    read_from_clustered_merged.highlight_image(clean_arrays, img_path, w, h, orientation, eps, filename)
				    return filename
			
 
				-
			
 
				def main(uuid=123, filepath=config_path+"/"+ "drawings/Stahl_Adapterplatte.PDF"):
				    """Cluster the text blocks of a drawing, raising eps until quality drops.

				    Steps:
				      1. Convert the PDF to HTML and extract the bounding boxes.
				      2. Collect the sorted nearest-neighbour distances as eps candidates.
				      3. Cluster once with the smallest candidate and show the result.
				      4. Re-cluster with increasing eps until a cluster score gets worse
				         while the words-per-cluster average exceeds that of the first
				         clustering, or until the candidates are exhausted.
				      5. Re-cluster with the last eps used and show the final boxes.

				    Changes against the previous version:
				      * ``cluster_and_preprocess`` ran twice with identical arguments; the
				        redundant second call is gone.
				      * A bare ``except`` detected the end of the candidates; the ``None``
				        returned by ``find_nearest_above`` is now tested explicitly, so
				        unrelated errors are no longer hidden.
				      * The final image is labelled with ``old_eps``, the eps actually used
				        for the final clustering (``eps`` could be ``None`` here before).
				      * "Last W/B" reports the average of the final clustering.

				    Args:
				        uuid: identifier forwarded to ``pdf_to_html``.
				        filepath: path of the PDF drawing to process.
				    """
				    path = config_path
				    default_csv = path + "/temporary/list_to_csv_with_corner_points.csv"
				    clustered_csv = path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv"

				    filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
				    # coordinates + text out of the html file, as an array of arrays
				    result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
				    print("number_blocks:", number_blocks)
				    print("number_words:", number_words)
				    avg_words_block = number_words / number_blocks
				    print("avg words/blocks", avg_words_block)
				    result_df = get_distances.get_average_xy(result, path)

				    # eps candidates: distinct nearest-neighbour distances, ascending
				    knn = get_min_nn(result_df, path)
				    eps = min(knn)
				    print("min_knn: ", eps)

				    # first clustering pass with the smallest candidate
				    res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)

				    # unclustered blocks
				    clean_arrays = read_from_clustered_merged.read_default(default_csv)
				    show_boxes(filepath, clean_arrays, "default")

				    # result of the first clustering pass
				    clean_arrays = read_from_clustered_merged.read(clustered_csv)
				    show_boxes(filepath, clean_arrays, eps)

				    # baseline for the stop criterion
				    avg_words_block = avg_words_block_clustered(number_words, number_clusters)
				    print("a/w first clustering eps=1: ", avg_words_block)
				    print("cluster, eps: ", eps)
				    chs = chs_old

				    while True:    # this condition has to be changed to the breaking condition
				        print("cluster, eps: ", eps)
				        dbs_old = dbs
				        chs_old = chs
				        silhoutte_old = silhoutte
				        res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, eps, path)

				        # stop criterion still to be settled (silhouette, Davies-Bouldin,
				        # Calinski-Harabasz, or a combination of the three)
				        avg_words_block_new = avg_words_block_clustered(number_words, number_clusters)
				        print("avg_words_blocks:", avg_words_block_new)

				        read_from_clustered_merged.read(clustered_csv)

				        print(dbs <= dbs_old)
				        print(chs >= chs_old)
				        print(silhoutte >= silhoutte_old)

				        old_eps = eps

				        # Report which score got worse first (mostly silhouette and dbs).
				        # "not a <= b" is kept instead of "a > b" so NaN scores behave as before.
				        more_words_per_block = avg_words_block_new > avg_words_block
				        worsened = (
				            ("dbs", not dbs <= dbs_old),
				            ("chs", not chs >= chs_old),
				            ("silhoutte", not silhoutte >= silhoutte_old),
				        )
				        for score_name, got_worse in worsened:
				            if got_worse and more_words_per_block:
				                print("stopping threshold reached " + score_name)
				                clean_arrays = read_from_clustered_merged.read(clustered_csv)
				                show_boxes(filepath, clean_arrays, "threshold " + score_name)

				        # and/or: "or" does not cluster too much, but can also be not enough
				        if more_words_per_block and any(got_worse for _, got_worse in worsened):
				            print("stopping threshold reached")
				            break

				        # Advance to the next larger candidate.  The array conversion keeps
				        # the subtraction inside find_nearest_above valid for a plain list.
				        next_index = find_nearest_above(np.asarray(knn), eps)
				        if next_index is None:
				            print("highest nn value reached")
				            break
				        eps = knn[next_index]

				    res, number_clusters, dbs, chs, silhoutte = dbscan.clustering(dm, old_eps, path)
				    print("Last EPS: ", old_eps)
				    print("Last W/B: ", avg_words_block_clustered(number_words, number_clusters))
				    clean_arrays = read_from_clustered_merged.read_default(default_csv)
				    show_boxes(filepath, clean_arrays, "default")
				    clean_arrays = read_from_clustered_merged.read(clustered_csv)
				    # show results
				    show_boxes(filepath, clean_arrays, old_eps)


				if __name__ == "__main__":
				    main()
			
--- a/clustering_precomputed_dbscan.py
+++ b/clustering_precomputed_dbscan.py
@@ -4,40 +4,32 @@ import pandas
 
				 import csv
			
 
				 from math import sqrt
			
 
				 from sklearn.cluster import DBSCAN
			
 
				+from sklearn import metrics
			
 
				+from sklearn.metrics import davies_bouldin_score
			
 
				+import time
			
 
				 
			
 
				 def get_average_xy(list_input, path):
			
 
				     csv_name = path+"/temporary/list_to_csv_with_corner_points.csv"
			
 
				-    resultFile = open(csv_name, 'w+')
			
 
				+    resultFile = open(csv_name, 'w')
			
 
				     wr = csv.writer(resultFile, delimiter=";")
			
 
				     wr.writerow(["element", "xmin","ymin","xmax","ymax", "ausrichtung","point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])
			
 
				     result_df = pandas.DataFrame(columns=["point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma","ausrichtung"])
			
 
				+
			
 
				     for element in list_input:
			
 
				-        xavg_elem = 0
			
 
				-        yavg_elem = 0
			
 
				         ymin = 100000000
			
 
				         ymax = 0
			
 
				         xmin = 100000000
			
 
				         xmax = 0
			
 
				         newList = []
			
 
				-        check = False
			
 
				         if len(element) == 5 and not isinstance(element[0], list):
			
 
				             newList.append(element)
			
 
				             element = newList
			
 
				-        """if len(element) != 5 and isinstance(element[0], list):
			
 
				-            for el in element:
			
 
				-                check = isinstance(el[0], list)
			
 
				-                if len(el) != 5:
			
 
				-                    print(el)
			
 
				-                #if check:
			
 
				-                #    print(el)"""
			
 
				-
			
 
				         for blub in element: #get the smallest and largest x and y value for whole block
			
 
				 
			
 
				-            if isinstance(blub[0],list) and len(blub[0])==5:
			
 
				+            if isinstance(blub[0],list) and len(blub[0]) == 5:
			
 
				                 blub = blub [0]
			
 
				             if float(blub[1]) < ymin:
			
 
				                 ymin = float(blub[1])
			
 
				-                #print("y_min:",y_min)
			
 
				             if float(blub[0]) < xmin:
			
 
				                 xmin = float(blub[0])
			
 
				             if float(blub[3]) > ymax:
			
@@ -45,12 +37,12 @@ def get_average_xy(list_input, path):
 
				             if float(blub[2]) > xmax:
			
 
				                 xmax = float(blub[2])
			
 
				         if float(xmax)-float(xmin) > 1.3*(float(ymax)-float(ymin)):
			
 
				-            ausrichtung = 0 #horizontal
			
 
				-        if 1.5*(float(xmax)-float(xmin)) < float(ymax)-float(ymin):
			
 
				-            ausrichtung = 1 #vertikal
			
 
				+            ausrichtung = 0  # horizontal
			
 
				+        #elif
			
 
				+        elif 1.3*(float(xmax)-float(xmin)) < float(ymax)-float(ymin):
			
 
				+            ausrichtung = 1   # vertikal
			
 
				         else:
			
 
				-            ausrichtung = 3 #sonstiges
			
 
				-
			
 
				+            ausrichtung = 3   # sonstiges
			
 
				 
			
 
				         ##### GET CORNER POINTS
			
 
				         point_xmi_ymi = [xmin,ymin]
			
@@ -67,35 +59,70 @@ def intersects(rectangle1, rectangle2): #using the separating axis theorem, retu
 
				 
			
 
				     rect_1_min = eval(rectangle1[0])
			
 
				     rect_1_max = eval(rectangle1[3])
			
 
				-    rect1_bottom_left_x= rect_1_min[0]
			
 
				-    rect1_top_right_x=rect_1_max[0]
			
 
				-    rect1_bottom_left_y= rect_1_max[1]
			
 
				-    rect1_top_right_y= rect_1_min[1]
			
 
				+    rect1_bottom_left_x = rect_1_min[0]
			
 
				+    rect1_top_right_x = rect_1_max[0]
			
 
				+    rect1_bottom_left_y = rect_1_max[1]
			
 
				+    rect1_top_right_y = rect_1_min[1]
			
 
				 
			
 
				     rect_2_min = eval(rectangle2[0])
			
 
				     rect_2_max = eval(rectangle2[3])
			
 
				-    rect2_bottom_left_x= rect_2_min[0]
			
 
				-    rect2_top_right_x=rect_2_max[0]
			
 
				-    rect2_bottom_left_y= rect_2_max[1]
			
 
				-    rect2_top_right_y=rect_2_min[1]
			
 
				+    rect2_bottom_left_x = rect_2_min[0]
			
 
				+    rect2_top_right_x = rect_2_max[0]
			
 
				+    rect2_bottom_left_y = rect_2_max[1]
			
 
				+    rect2_top_right_y = rect_2_min[1]
			
 
				 
			
 
				     return not (rect1_top_right_x < rect2_bottom_left_x or rect1_bottom_left_x > rect2_top_right_x or rect1_top_right_y > rect2_bottom_left_y or rect1_bottom_left_y < rect2_top_right_y)
			
 
				 
			
 
				 
			
 
				def get_ausrichtung(rectangle1,rectangle2):
				    """Classify two rectangles as lying "above" or beside ("side") each other.

				    Only the first corner of each rectangle is compared.  The offsets are
				    now compared by magnitude, so the answer no longer depends on the
				    order of the arguments; the previous signed comparison could report
				    "above" for (a, b) and "side" for (b, a).

				    Args:
				        rectangle1, rectangle2: sequences whose first item is the string
				            form of an ``[x, y]`` corner, e.g. ``"[10.0, 20.0]"``.

				    Returns:
				        "above" when the vertical offset is larger than the horizontal
				        one, otherwise "side" (ties included).
				    """
				    # NOTE(security): eval() executes arbitrary expressions.  It is only
				    # acceptable while these strings come from this project's own CSV files.
				    min_1 = eval(rectangle1[0])
				    min_2 = eval(rectangle2[0])
				    diff_y = abs(min_1[1] - min_2[1])
				    diff_x = abs(min_1[0] - min_2[0])
				    if diff_x < diff_y:
				        return "above"
				    return "side"
			
 
				+
			
 
				+
			
 
				def get_parallel(rectangle1, rectangle2):
				    """Tell whether two equally oriented rectangles run alongside each other.

				    Item 4 of each rectangle is its orientation code as a string.  The
				    result is True for two code-0 rectangles classified "above" by
				    ``get_ausrichtung`` and for two code-1 rectangles classified "side";
				    every other combination yields False.
				    """
				    # NOTE(security): eval() on CSV-derived strings; trusted input only.
				    orientation_1 = eval(rectangle1[4])
				    orientation_2 = eval(rectangle2[4])
				    if orientation_1 != orientation_2:
				        return False
				    if orientation_1 == 0:
				        return get_ausrichtung(rectangle1, rectangle2) == "above"
				    if orientation_1 == 1:
				        return get_ausrichtung(rectangle1, rectangle2) == "side"
				    return False
			
 
				+
			
 
				+
			
 
				 def dist(rectangle1, rectangle2):
				     """Return the clustering distance between two rectangles.

				     The distance is the mean of the two smallest corner-to-corner
				     distances.  Intersecting rectangles get 0.  Rectangles flagged by
				     ``get_parallel`` get a penalty of 1000 so they are not clustered.

				     Fixes against the previous version:
				       * ``second_dist`` held the previous running best, not the second
				         smallest distance, and stayed at the 1e8 sentinel when the very
				         first corner pair was the closest one.
				       * the parallel and intersection checks ran inside the corner loop,
				         so the penalty was added repeatedly and then overwritten.
				       * the local variable ``dist`` shadowed the function name.

				     Args:
				         rectangle1, rectangle2: rows whose first four items are the
				             string forms of the ``[x, y]`` corners and whose fifth item is
				             the orientation code.

				     Returns:
				         The distance as a float.
				     """
				     if intersects(rectangle1, rectangle2):
				         return 0.0

				     # NOTE(security): eval() on CSV-derived strings; trusted input only.
				     corners_1 = [eval(corner) for corner in rectangle1[:4]]
				     corners_2 = [eval(corner) for corner in rectangle2[:4]]
				     corner_distances = sorted(
				         sqrt((float(p2[0]) - float(p1[0]))**2 + (float(p2[1]) - float(p1[1]))**2)
				         for p1 in corners_1
				         for p2 in corners_2
				     )
				     distance, second_dist = corner_distances[0], corner_distances[1]

				     if get_parallel(rectangle1, rectangle2):
				         distance += 1000
				         second_dist += 1000
				     return (distance + second_dist) / 2
			
 
				 
			
 
				 def clustering(dm,eps,path):
			
@@ -106,17 +133,58 @@ def clustering(dm,eps,path):
 
				     print('Estimated number of clusters: %d' % n_clusters_)
			
 
				     data_df = pandas.read_csv(path +"/temporary/list_to_csv_with_corner_points.csv", sep=";")
			
 
				     data_df["cluster"] = labels
			
 
				-    data_df.groupby(['cluster', 'ausrichtung'])['element'].apply(','.join).reset_index().to_csv(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv",sep=";", header=False, index=False)
			
 
				-    return data_df
			
 
				+    try:
			
 
				+        dbs = davies_bouldin_score(dm, labels)
			
 
				+        #dbs = "1"
			
 
				+        chs = metrics.calinski_harabasz_score(dm, labels)
			
 
				+        #chs = 1
			
 
				+        silhoutte = metrics.silhouette_score(dm, labels, metric='precomputed')
			
 
				+        #silhoutte = 2
			
 
				+        print("DBscore: ", dbs)
			
 
				+        print("calsinski: ", chs)
			
 
				+        print("silhoutte: ", silhoutte)
			
 
				+
			
 
				+    except:
			
 
				+        dbs=1
			
 
				+        chs=1
			
 
				+        silhoutte=1
			
 
				+
			
 
				+    data_df["ausrichtung"] = 1
			
 
				+    data_df = data_df.groupby(['cluster', 'ausrichtung'])['element'].apply(','.join).reset_index()
			
 
				+    data_df.to_csv(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv",sep=";", header=False, index=False)
			
 
				+
			
 
				+    return data_df, n_clusters_, dbs, chs, silhoutte
			
 
				 
			
 
				 def cluster_and_preprocess(result,eps,path):
				     """Build the rectangle distance matrix and run one clustering pass.

				     The corner points from ``get_average_xy`` are written to
				     ``<path>/temporary/blub.csv`` and read back as strings, which is the
				     form ``dist`` expects.  The duration of each stage is printed.

				     Returns:
				         ``(clustering_result, n_clusters_, dbs, chs, silhoutte, dm)``: the
				         five values returned by ``clustering`` plus the distance matrix.
				     """
				     blub_csv = path+"/temporary/blub.csv"

				     started = time.time()
				     corner_df = get_average_xy(result, path) #input: array of arrays
				     print("time get average: ", time.time() - started)

				     started = time.time()
				     corner_df.to_csv(blub_csv, sep=";", index=False, header=None)
				     print("time to csv:" , time.time() - started)

				     with open(blub_csv) as csvfile:
				         rows = list(csv.reader(csvfile, delimiter=';'))

				     started = time.time()
				     dm = np.asarray([[dist(row_a, row_b) for row_b in rows] for row_a in rows])
				     print("time dm:" , time.time() - started)

				     started = time.time()
				     clustering_result, n_clusters_, dbs, chs, silhoutte = clustering(dm,float(eps), path)
				     print("time clustering:" , time.time() - started)

				     return clustering_result, n_clusters_, dbs, chs, silhoutte, dm
			
 
				 
			
--- a/clustering_precomputed_dbscan_noParallels.py
+++ b/clustering_precomputed_dbscan_noParallels.py
@@ -1,254 +0,0 @@
 
				-# coding: utf8
			
 
				-import numpy as np
			
 
				-import pandas
			
 
				-import csv
			
 
				-from math import sqrt
			
 
				-from sklearn.cluster import DBSCAN
			
 
				-
			
 
				-from sklearn import metrics
			
 
				-from sklearn.metrics import davies_bouldin_score
			
 
				-import time
			
 
				-
			
 
				def get_average_xy(list_input, path):
				    """Compute the bounding box, orientation and corner points of each block.

				    One row per block is written to
				    ``<path>/temporary/list_to_csv_with_corner_points.csv`` and the corner
				    points plus orientation are returned as a DataFrame.

				    The CSV file is now opened in a ``with`` block so it is closed even
				    when a malformed block raises; unused locals and a dead string literal
				    were removed.

				    Args:
				        list_input: iterable of blocks.  A block is either a list of boxes
				            or a single box; a box has five items whose first four are
				            read as x-min, y-min, x-max, y-max.
				        path: project directory containing the ``temporary`` folder.

				    Returns:
				        DataFrame with the columns ``point_xmi_ymi``, ``point_xma_ymi``,
				        ``point_xmi_yma``, ``point_xma_yma`` and ``ausrichtung``
				        (0 = horizontal, 1 = vertical, 3 = other).
				    """
				    csv_name = path+"/temporary/list_to_csv_with_corner_points.csv"
				    result_df = pandas.DataFrame(columns=["point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma","ausrichtung"])

				    # newline='' is what the csv module expects for files it writes to.
				    with open(csv_name, 'w', newline='') as resultFile:
				        wr = csv.writer(resultFile, delimiter=";")
				        wr.writerow(["element", "xmin","ymin","xmax","ymax", "ausrichtung","point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])

				        for element in list_input:
				            # A bare box is wrapped so it is handled like a one-box block.
				            if len(element) == 5 and not isinstance(element[0], list):
				                element = [element]

				            ymin = 100000000
				            ymax = 0
				            xmin = 100000000
				            xmax = 0
				            # smallest and largest x and y value for the whole block
				            for blub in element:
				                if isinstance(blub[0],list) and len(blub[0]) == 5:
				                    blub = blub[0]
				                ymin = min(ymin, float(blub[1]))
				                xmin = min(xmin, float(blub[0]))
				                ymax = max(ymax, float(blub[3]))
				                xmax = max(xmax, float(blub[2]))

				            width = float(xmax)-float(xmin)
				            height = float(ymax)-float(ymin)
				            if width > 1.3*height:
				                ausrichtung = 0  # horizontal
				            elif 1.3*width < height:
				                ausrichtung = 1  # vertical
				            else:
				                ausrichtung = 3  # other

				            # corner points
				            point_xmi_ymi = [xmin,ymin]
				            point_xma_ymi = [xmax,ymin]
				            point_xmi_yma = [xmin,ymax]
				            point_xma_yma = [xmax,ymax]
				            wr.writerow([element,xmin,ymin,xmax,ymax, ausrichtung,point_xmi_ymi,point_xma_ymi,point_xmi_yma,point_xma_yma])
				            result_df.loc[len(result_df)]=[point_xmi_ymi,point_xma_ymi, point_xmi_yma, point_xma_yma,ausrichtung]

				    return result_df
			
 
				-
			
 
				def intersects(rectangle1, rectangle2):
				    """Return True when two axis-aligned rectangles overlap or touch.

				    Item 0 of each rectangle is the string form of its ``[x, y]`` minimum
				    corner and item 3 that of its maximum corner.  The rectangles are
				    disjoint exactly when one lies completely to one side of the other on
				    the x or the y axis.
				    """
				    # NOTE(security): eval() on CSV-derived strings; trusted input only.
				    low_1 = eval(rectangle1[0])
				    high_1 = eval(rectangle1[3])
				    low_2 = eval(rectangle2[0])
				    high_2 = eval(rectangle2[3])

				    overlap_x = high_1[0] >= low_2[0] and low_1[0] <= high_2[0]
				    overlap_y = low_1[1] <= high_2[1] and high_1[1] >= low_2[1]
				    return overlap_x and overlap_y
			
 
				-
			
 
				-
			
 
				def get_ausrichtung(rectangle1,rectangle2):
				    """Classify two rectangles as lying "above" or beside ("side") each other.

				    Only the first corner of each rectangle is compared.  The offsets are
				    now compared by magnitude, so the answer no longer depends on the
				    order of the arguments; the previous signed comparison could report
				    "above" for (a, b) and "side" for (b, a).

				    Args:
				        rectangle1, rectangle2: sequences whose first item is the string
				            form of an ``[x, y]`` corner, e.g. ``"[10.0, 20.0]"``.

				    Returns:
				        "above" when the vertical offset is larger than the horizontal
				        one, otherwise "side" (ties included).
				    """
				    # NOTE(security): eval() executes arbitrary expressions.  It is only
				    # acceptable while these strings come from this project's own CSV files.
				    min_1 = eval(rectangle1[0])
				    min_2 = eval(rectangle2[0])
				    diff_y = abs(min_1[1] - min_2[1])
				    diff_x = abs(min_1[0] - min_2[0])
				    if diff_x < diff_y:
				        return "above"
				    return "side"
			
 
				-
			
 
				-
			
 
				def get_parallel(rectangle1, rectangle2):
				    """Tell whether the long sides of two rectangles run alongside each other.

				    Such pairs are not meant to be clustered.  Item 4 of each rectangle is
				    its orientation code as a string.  The result is True for two code-0
				    rectangles classified "above" by ``get_ausrichtung`` and for two code-1
				    rectangles classified "side"; every other combination yields False.

				    The previous version also evaluated both corner strings of each
				    rectangle and computed four side lengths that were never used; that
				    work is removed because this function runs once per rectangle pair.
				    """
				    # NOTE(security): eval() on CSV-derived strings; trusted input only.
				    ausrichtung_1 = eval(rectangle1[4])
				    ausrichtung_2 = eval(rectangle2[4])
				    if ausrichtung_1 != ausrichtung_2:
				        return False
				    if ausrichtung_1 == 0:
				        return get_ausrichtung(rectangle1, rectangle2) == "above"
				    if ausrichtung_1 == 1:
				        return get_ausrichtung(rectangle1, rectangle2) == "side"
				    return False
			
 
				-
			
 
				-
			
 
				def dist(rectangle1, rectangle2):
				    """Return the clustering distance between two rectangles.

				    The distance is the mean of the two smallest corner-to-corner
				    distances.  Intersecting rectangles get 0.  Rectangles flagged by
				    ``get_parallel`` get a penalty of 1000 so they are not clustered.

				    Fixes against the previous version:
				      * ``second_dist`` held the previous running best, not the second
				        smallest distance, and stayed at the 1e8 sentinel when the very
				        first corner pair was the closest one.
				      * the parallel and intersection checks ran inside the corner loop,
				        so the penalty was added repeatedly and then overwritten.
				      * the local variable ``dist`` shadowed the function name, and the
				        unused ``dist_x`` / ``dist_y`` locals are gone.

				    Args:
				        rectangle1, rectangle2: rows whose first four items are the string
				            forms of the ``[x, y]`` corners and whose fifth item is the
				            orientation code.

				    Returns:
				        The distance as a float.
				    """
				    if intersects(rectangle1, rectangle2):
				        return 0.0

				    # NOTE(security): eval() on CSV-derived strings; trusted input only.
				    corners_1 = [eval(corner) for corner in rectangle1[:4]]
				    corners_2 = [eval(corner) for corner in rectangle2[:4]]
				    corner_distances = sorted(
				        sqrt((float(p2[0]) - float(p1[0]))**2 + (float(p2[1]) - float(p1[1]))**2)
				        for p1 in corners_1
				        for p2 in corners_2
				    )
				    distance, second_dist = corner_distances[0], corner_distances[1]

				    if get_parallel(rectangle1, rectangle2):
				        distance += 1000
				        second_dist += 1000
				    return (distance + second_dist) / 2
			
 
				-
			
 
				def clustering(dm,eps,path):
				    """Run DBSCAN on a precomputed distance matrix and score the result.

				    The cluster labels are attached to the rows of
				    ``<path>/temporary/list_to_csv_with_corner_points.csv``; the elements
				    of each cluster are joined with "," and written to
				    ``<path>/temporary/values_clusteredfrom_precomputed_dbscan.csv``.

				    Args:
				        dm: square distance matrix.
				        eps: DBSCAN neighbourhood radius.
				        path: project directory containing the ``temporary`` folder.

				    Returns:
				        ``(data_df, n_clusters_, dbs, chs, silhoutte)``: the grouped frame,
				        the number of clusters, and the Davies-Bouldin, Calinski-Harabasz
				        and silhouette scores.  All three scores are 1 when scoring fails.
				    """
				    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)
				    labels = db.labels_
				    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

				    print('Estimated number of clusters: %d' % n_clusters_)
				    data_df = pandas.read_csv(path +"/temporary/list_to_csv_with_corner_points.csv", sep=";")
				    data_df["cluster"] = labels

				    # NOTE(review): the first two scores receive the distance matrix as if
				    # it were a feature matrix - confirm this is intended.
				    try:
				        dbs = davies_bouldin_score(dm, labels)
				        chs = metrics.calinski_harabasz_score(dm, labels)
				        silhoutte = metrics.silhouette_score(dm, labels, metric='precomputed')
				    except Exception as err:
				        # Best effort as before, but no longer swallowing KeyboardInterrupt
				        # and no longer hiding the reason for the fallback.
				        print("cluster scores unavailable, using 1: ", err)
				        dbs = 1
				        chs = 1
				        silhoutte = 1
				    else:
				        print("DBscore: ", dbs)
				        print("calsinski: ", chs)
				        print("silhoutte: ", silhoutte)

				    # The orientation is overwritten with a constant, so the grouping
				    # below is effectively by cluster only.
				    data_df["ausrichtung"] = 1
				    data_df = data_df.groupby(['cluster', 'ausrichtung'])['element'].apply(','.join).reset_index()
				    data_df.to_csv(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv",sep=";", header=False, index=False)

				    return data_df, n_clusters_, dbs, chs, silhoutte
			
 
				-
			
 
				def cluster_and_preprocess(result,eps,path):
				    """Build the rectangle distance matrix and run one clustering pass.

				    The corner points from ``get_average_xy`` are written to
				    ``<path>/temporary/blub.csv`` and read back as strings, which is the
				    form ``dist`` expects.  The duration of each stage is printed.

				    Returns:
				        ``(clustering_result, n_clusters_, dbs, chs, silhoutte, dm)``: the
				        five values returned by ``clustering`` plus the distance matrix.
				    """
				    blub_csv = path+"/temporary/blub.csv"

				    started = time.time()
				    corner_df = get_average_xy(result, path) #input: array of arrays
				    print("time get average: ", time.time() - started)

				    started = time.time()
				    corner_df.to_csv(blub_csv, sep=";", index=False, header=None)
				    print("time to csv:" , time.time() - started)

				    with open(blub_csv) as csvfile:
				        rows = list(csv.reader(csvfile, delimiter=';'))

				    started = time.time()
				    dm = np.asarray([[dist(row_a, row_b) for row_b in rows] for row_a in rows])
				    print("time dm:" , time.time() - started)

				    started = time.time()
				    clustering_result, n_clusters_, dbs, chs, silhoutte = clustering(dm,float(eps), path)
				    print("time clustering:" , time.time() - started)

				    return clustering_result, n_clusters_, dbs, chs, silhoutte, dm
			
 
				-
			
--- a/config.txt
+++ b/config.txt
@@ -1 +1 @@
 
				-/home/bscheibel/technical_drawings_extraction
			
 
				+/home/bscheibel/technical_drawings_extraction
			
--- a/get_distances.py
+++ b/get_distances.py
@@ -1,227 +0,0 @@
 
				-from math import sqrt
			
 
				-import numpy as np
			
 
				-import pandas
			
 
				-import csv
			
 
				-#import math
			
 
				-from clustering_precomputed_dbscan_noParallels import intersects
			
 
				-from scipy import stats
			
 
				-
			
 
				-
			
 
				-def get_average_xy(list_input, path):
			
 
				-    csv_name = path+"/temporary/list_to_csv_with_corner_points_distances.csv"
			
 
				-    resultFile = open(csv_name, 'w')
			
 
				-    wr = csv.writer(resultFile, delimiter=";")
			
 
				-    wr.writerow(["element","point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])
			
 
				-
			
 
				-    result_df = pandas.DataFrame(columns=["point_xmi_ymi","point_xma_ymi","point_xmi_yma","point_xma_yma"])
			
 
				-    #result_df = pandas.DataFrame(columns=["xmin","ymin","xmax","ymax"])
			
 
				-
			
 
				-    for element in list_input:
			
 
				-        xavg_elem = 0
			
 
				-        yavg_elem = 0
			
 
				-        ymin = 100000000
			
 
				-        ymax = 0
			
 
				-        xmin = 100000000
			
 
				-        xmax = 0
			
 
				-        newList = []
			
 
				-        check = False
			
 
				-        if len(element) == 5 and not isinstance(element[0], list):
			
 
				-            newList.append(element)
			
 
				-            element = newList
			
 
				-
			
 
				-        for blub in element: #get the smallest and largest x and y value for whole block
			
 
				-
			
 
				-            if isinstance(blub[0],list) and len(blub[0])==5:
			
 
				-                blub = blub [0]
			
 
				-            if float(blub[1]) < ymin:
			
 
				-                ymin = float(blub[1])
			
 
				-                #print("y_min:",y_min)
			
 
				-            if float(blub[0]) < xmin:
			
 
				-                xmin = float(blub[0])
			
 
				-            if float(blub[3]) > ymax:
			
 
				-                ymax = float(blub[3])
			
 
				-            if float(blub[2]) > xmax:
			
 
				-                xmax = float(blub[2])
			
 
				-        point_xmi_ymi = [xmin,ymin]
			
 
				-        point_xma_ymi = [xmax,ymin]
			
 
				-        point_xmi_yma = [xmin,ymax]
			
 
				-        point_xma_yma = [xmax,ymax]
			
 
				-        wr.writerow([element, point_xmi_ymi, point_xma_ymi, point_xmi_yma, point_xma_yma])
			
 
				-        result_df.loc[len(result_df)] = [ point_xmi_ymi, point_xma_ymi, point_xmi_yma, point_xma_yma]
			
 
				-
			
 
				-    resultFile.close()
			
 
				-    #result_df.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
			
 
				-
			
 
				-    return result_df
			
 
				-
			
 
				-
			
 
				-def dist(rectangle1, rectangle2):
			
 
				- #get minimal distance between two rectangles
			
 
				-    distance = 100000000
			
 
				-    #print(rectangle2,rectangle1)
			
 
				-    for point1 in rectangle1[:4]:
			
 
				-        #print(point1)
			
 
				-        point1 = eval(point1) #necessary to convert [] to real tuple
			
 
				-        for point2 in rectangle2[:4]:
			
 
				-            #print(point2)
			
 
				-            point2 = eval(point2)
			
 
				-            dist = sqrt(((float(point2[0]) - float(point1[0])))**2 + ((float(point2[1]) - float(point1[1])))**2)
			
 
				-            if dist < distance:
			
 
				-                distance = dist
			
 
				-        if intersects(rectangle1, rectangle2):
			
 
				-            distance = 0
			
 
				-    return distance
			
 
				-
			
 
				-def size_blocks(list_input):  #x, y distance of blocks (not regarding words)
			
 
				-    #print(list_input)
			
 
				-    min_size_x = 1000000000
			
 
				-    max_size_x = 0
			
 
				-    min_size_y = 1000000000
			
 
				-    max_size_y = 0
			
 
				-    x_size = []
			
 
				-    y_size = []
			
 
				-    diagonal = []
			
 
				-    for element in list_input:
			
 
				-        newList = []
			
 
				-        ymin = 1000000000
			
 
				-        ymax = 0
			
 
				-        xmin = 1000000000
			
 
				-        xmax = 0
			
 
				-        if len(element) == 5 and not isinstance(element[0], list):
			
 
				-            newList.append(element)
			
 
				-            element = newList
			
 
				-
			
 
				-        for blub in element:  # get the smallest and largest x and y value for whole block, block sizes
			
 
				-            print(blub)
			
 
				-            if isinstance(blub[0], list) and len(blub[0]) == 5:
			
 
				-                blub = blub[0]
			
 
				-            if float(blub[1]) < ymin:
			
 
				-                ymin = float(blub[1])
			
 
				-                # print("y_min:",y_min)
			
 
				-            if float(blub[0]) < xmin:
			
 
				-                xmin = float(blub[0])
			
 
				-            if float(blub[3]) > ymax:
			
 
				-                ymax = float(blub[3])
			
 
				-            if float(blub[2]) > xmax:
			
 
				-                xmax = float(blub[2])
			
 
				-
			
 
				-        distance_x = xmax-xmin
			
 
				-        distance_y = ymax-ymin
			
 
				-        diagonal_ = math.sqrt(distance_x ** 2 + distance_y ** 2)  # satz der pythogoras
			
 
				-        diagonal.append(diagonal_)
			
 
				-        x_size.append(distance_x)
			
 
				-        #print(distance_x, blub[4])
			
 
				-        #print(distance_y, blub[4])
			
 
				-        y_size.append(distance_y)
			
 
				-        if distance_x < min_size_x:
			
 
				-            min_size_x = distance_x
			
 
				-        if distance_x > max_size_x:
			
 
				-            max_size_x = distance_x
			
 
				-        if distance_y < min_size_y:
			
 
				-            min_size_y = distance_y
			
 
				-        if distance_y > max_size_y:
			
 
				-            max_size_y = distance_y
			
 
				-
			
 
				-    x_size = np.array(x_size)
			
 
				-
			
 
				-    x_size = x_size.round(decimals=0)
			
 
				-    #print(x_size)
			
 
				-    median_size_x = np.median(x_size)
			
 
				-    modus_size_x = stats.mode(x_size)
			
 
				-
			
 
				-    y_size = np.array(y_size)
			
 
				-
			
 
				-    y_size = y_size.round(decimals=0)
			
 
				-    #print(y_size)
			
 
				-    median_size_y = np.median(y_size)
			
 
				-    modus_size_y = stats.mode(y_size)
			
 
				-
			
 
				-
			
 
				-
			
 
				-    #print(min_size_x,max_size_x, min_size_y, max_size_y)
			
 
				-    print("Size_Median_x:", median_size_x)
			
 
				-    print("Size_Median_y:", median_size_y)
			
 
				-    print("Size_Modus_x:", modus_size_x)
			
 
				-    print("Size_Modus_y:", modus_size_y)
			
 
				-    return min_size_x,max_size_x, min_size_y, max_size_y, diagonal
			
 
				-
			
 
				-
			
 
				-def distance_btw_blocks(result, path):
			
 
				-    result.to_csv(path+"/temporary/blub_distances.csv", sep=";", index=False, header=None)
			
 
				-    with open(path+"/temporary/blub_distances.csv") as csvfile:
			
 
				-        readCSV = csv.reader(csvfile, delimiter=';')
			
 
				-        result = list(readCSV)
			
 
				-    dm = np.asarray([[dist(p1, p2) for p2 in result] for p1 in result])
			
 
				-
			
 
				-    dm_flattened = dm.flatten()
			
 
				-    dm_flattened = dm_flattened[dm_flattened != 0]
			
 
				-    dm_flattened = dm_flattened.round(decimals=0)
			
 
				-    #dm_ordered = sorted(dm_flattened)
			
 
				-    #print(dm_ordered)
			
 
				-    median= np.median(dm_flattened)
			
 
				-
			
 
				-
			
 
				-
			
 
				-    mode = stats.mode(dm_flattened)
			
 
				-    most_often= pandas.value_counts(dm_flattened)
			
 
				-    #x = itemfreq(dm_flattened)
			
 
				-    #print(pandas.DataFrame(most_often, columns=["first"]).columns)
			
 
				-    #print(x)
			
 
				-    print("Distance Mode:", mode)
			
 
				-    #print(most_often)
			
 
				-    #largest = most_often.nsmallest(15, "first")
			
 
				-    #print(largest)
			
 
				-    print("Distance Median:", median)
			
 
				-    #print(max_dist, min_dist)
			
 
				-
			
 
				-    return dm
			
 
				-
			
 
				-def distance_knn(dm):
			
 
				-    knn = []
			
 
				-    for row in dm:
			
 
				-        row = row[row != 0]
			
 
				-        row = row.round(decimals=2)
			
 
				-        row = sorted(row)
			
 
				-        knn.extend(row[:2])
			
 
				-    return knn
			
 
				-
			
 
				-def avg_words_block_clustered(words, cluster):
			
 
				-    blocks = cluster
			
 
				-    avg_words = words/blocks
			
 
				-    #print(avg_words)
			
 
				-    return avg_words
			
 
				-
			
 
				-
			
 
				-# def main(uuid, filepath):
			
 
				-#     path = "/home/bscheibel/PycharmProjects/clustering"
			
 
				-#     filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
			
 
				-#     result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
			
 
				-#     print("number_blocks:", number_blocks)
			
 
				-#     print("number_words:", number_words)
			
 
				-#     print("avg words/blocks", number_words/number_blocks)
			
 
				-#     size_blocks(result)
			
 
				-#
			
 
				-#     result_df = get_average_xy(result, path)
			
 
				-#
			
 
				-#
			
 
				-#     result = clustering_precomputed_dbscan_og_without.get_average_xy(result, path) #input: array of arrays, output: either csv file or array of arrays
			
 
				-#     #print(result)
			
 
				-#     result.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
			
 
				-#     #with open(path+"/temporary/blub.csv") as csvfile:
			
 
				-#     #    readCSV = csv.reader(csvfile, delimiter=';')
			
 
				-#     #    result = list(readCSV)
			
 
				-#     dm = distance_btw_blocks(result_df, path)
			
 
				-#     knn = distance_knn(dm)
			
 
				-#     median= np.median(knn)
			
 
				-#     mode = stats.mode(knn)
			
 
				-#     min_nn = min(knn)
			
 
				-#     avg = np.sum(knn)/len(knn)
			
 
				-#     print("min_knn:", min_nn)
			
 
				-#     print("knn_mean:", avg)
			
 
				-#     print("knn_median:", median)
			
 
				-#     print("knn_mode:", mode)
			
 
				-
			
 
				-    #knn.nearestNeighbors(dm)
			
 
				-
			
 
				-
			
 
				-#main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/Werkstattzeichnung Zwischenwelle.pdf")
			
--- a/main.py
+++ b/main.py
@@ -1,51 +1,73 @@
 
				 import order_bounding_boxes_in_each_block
			
 
				-import clustering_precomputed_dbscan_noParallels as dbscan
			
 
				+import clustering_precomputed_dbscan as dbscan
			
 
				 import read_from_clustered_merged
			
 
				-import regex_clean_new
			
 
				 import organize_drawing_according_to_details_new
			
 
				-import json
			
 
				 import redis
			
 
				+import json
			
 
				 import sys
			
 
				-import get_distances
			
 
				-#import algoritm_knn
			
 
				+import csv
			
 
				+import numpy as np
			
 
				+
			
 
				+with open('/home/bscheibel/PycharmProjects/clustering/config.txt', 'r') as file:
			
 
				+    config_path = file.read()
			
 
				+    print("Path: ", config_path)
			
 
				+
			
 
				+
			
 
				+def distance_knn(dm):
			
 
				+    knn = []
			
 
				+    for row in dm:
			
 
				+        row = row[row != 0]
			
 
				+        row = row.round(decimals=2)
			
 
				+        row = sorted(row)
			
 
				+        knn.extend(row[:2])
			
 
				+    return knn
			
 
				+
			
 
				+
			
 
				+def distance_btw_blocks(result, path):
			
 
				+    result.to_csv(path+"/temporary/blub_distances.csv", sep=";", index=False, header=None)
			
 
				+    with open(path+"/temporary/blub_distances.csv") as csvfile:
			
 
				+        read_csv = csv.reader(csvfile, delimiter=';')
			
 
				+        result = list(read_csv)
			
 
				+    dm = np.asarray([[dbscan.dist(p1, p2) for p2 in result] for p1 in result])
			
 
				+    return dm
			
 
				 
			
 
				-config_path = "/home/bscheibel/technical_drawings_extraction"
			
 
				 
			
 
				 def get_min_nn(result, path):
			
 
				-    dm = get_distances.distance_btw_blocks(result, path)
			
 
				-    knn = get_distances.distance_knn(dm)
			
 
				+    dm = distance_btw_blocks(result, path)
			
 
				+    knn = distance_knn(dm)
			
 
				     knn = list(set(knn))
			
 
				     knn = sorted(knn)
			
 
				     return knn
			
 
				 
			
 
				+
			
 
				 def find_nearest_above(my_array, target):
			
 
				     diff = my_array - target
			
 
				     mask = np.ma.less_equal(diff, 0)
			
 
				     if np.all(mask):
			
 
				-        return None # returns None if target is greater than any value
			
 
				+        return None
			
 
				     masked_diff = np.ma.masked_array(diff, mask)
			
 
				     return masked_diff.argmin()
			
 
				 
			
 
				+
			
 
				 def write_redis(uuid, result, db_params):
			
 
				-    db = redis.Redis(db_params)
			
 
				+    db_params = redis.Redis(db_params)
			
 
				     #db = db = redis.Redis(unix_socket_path='/tmp/redis.sock',db=7)
			
 
				     print(db_params)
			
 
				-    db.set(uuid, result)
			
 
				+    db_params.set(uuid, result)
			
 
				 
			
 
				-def main(uuid, filepath, db, eps):
			
 
				-    print("TEEEEST")
			
 
				-    print(filepath)
			
 
				+
			
 
				+def main(uuid, filepath, db, eps_manual):
			
 
				     path = config_path
			
 
				     filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
			
 
				     result, number_blocks, number_words= order_bounding_boxes_in_each_block.get_bound_box(filename)  ##get coordinates+text out of html file into array of arrays
			
 
				     isos, general_tol = order_bounding_boxes_in_each_block.extract_isos(result)
			
 
				-    result_df = get_distances.get_average_xy(result, path)
			
 
				+    result_df = dbscan.get_average_xy(result, path)
			
 
				     knn = get_min_nn(result_df, path)
			
 
				     eps = min(knn)
			
 
				     res, number_clusters, dbs, chs_old, silhoutte, dm = dbscan.cluster_and_preprocess(result, eps, path)
			
 
				     stopping_criterion = False
			
 
				 
			
 
				-    while(not stopping_criterion):    # this condition has to be changed to the breaking condition
			
 
				+    while not stopping_criterion:
			
 
				 
			
 
				         print("cluster, eps: ", eps)
			
 
				         silhoutte_old = silhoutte
			
@@ -53,10 +75,9 @@ def main(uuid, filepath, db, eps):
 
				 
			
 
				         read_from_clustered_merged.read(path + "/temporary/values_clusteredfrom_precomputed_dbscan.csv")
			
 
				         old_eps = eps
			
 
				-
			
 
				-        if (not silhoutte >= silhoutte_old): #and avg_words_block_new > avg_words_block:
			
 
				-             print("stopping threshold reached")
			
 
				-             stopping_criterion = True
			
 
				+        if not silhoutte >= silhoutte_old:
			
 
				+            print("stopping threshold reached")
			
 
				+            stopping_criterion = True
			
 
				         try:
			
 
				             eps = find_nearest_above(knn, eps)
			
 
				             eps = knn[eps]
			
@@ -64,27 +85,25 @@ def main(uuid, filepath, db, eps):
 
				             print("highest nn value reached")
			
 
				             break
			
 
				 
			
 
				-    res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, eps, path)
			
 
				+    res, number_clusters, dbs, chs, silhouette = dbscan.clustering(dm, old_eps, path)
			
 
				     clean_arrays = read_from_clustered_merged.read(path+"/temporary/values_clusteredfrom_precomputed_dbscan.csv")
			
 
				     tables = order_bounding_boxes_in_each_block.get_tables(clean_arrays)
			
 
				-    pretty = regex_clean_new.print_clean(clean_arrays)
			
 
				+    pretty = read_from_clustered_merged.print_clean(clean_arrays)
			
 
				     res, details_dict = organize_drawing_according_to_details_new.main_function(pretty, tables)
			
 
				 
			
 
				     json_isos = json.dumps(isos)
			
 
				     json_result = json.dumps(res)
			
 
				-    json_details =json.dumps(details_dict)
			
 
				-    write_redis(uuid+"tol", general_tol,db)
			
 
				+    json_details = json.dumps(details_dict)
			
 
				+    write_redis(uuid+"tol", general_tol, db)
			
 
				     write_redis(uuid+"dims", json_result, db)
			
 
				     write_redis(uuid+"isos",json_isos, db)
			
 
				-    write_redis(uuid+"eps", str(number_blocks)+","+str(number_words), db)
			
 
				-    write_redis(uuid+"details",json_details ,db)
			
 
				+    write_redis(uuid+"eps", str(number_blocks)+"," + str(number_words), db)
			
 
				+    write_redis(uuid+"details", json_details, db)
			
 
				+
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     uuid = sys.argv[1]
			
 
				     filename = sys.argv[2]
			
 
				     db = sys.argv[3]
			
 
				     eps = sys.argv[4]
			
 
				-    main(uuid,filename, db, eps)
			
 
				-
			
 
				-
			
 
				-#main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/5129275_Rev01-GV12.pdf", "localhost",3)
			
 
				+    main(uuid, filename, db, eps)
			
--- a/order_bounding_boxes_in_each_block.py
+++ b/order_bounding_boxes_in_each_block.py
@@ -1,5 +1,3 @@
 
				-### FIRST READ EACH BLOCK IN AN ARRAY
			
 
				-
			
 
				 from bs4 import BeautifulSoup
			
 
				 import subprocess
			
 
				 import re
			
@@ -19,25 +17,13 @@ def get_bound_box(file):
 
				         words = block.findAll('word')
			
 
				         number_words += len(words)
			
 
				         for word in words:
			
 
				-            word_list = []
			
 
				-            word_list.append(word["xmin"])
			
 
				-            word_list.append(word["ymin"])
			
 
				-            word_list.append(word["xmax"])
			
 
				-            word_list.append(word["ymax"])
			
 
				-            word_list.append(word.string)
			
 
				+            word_list = [word["xmin"], word["ymin"], word["xmax"], word["ymax"], word.string]
			
 
				             list_elements.append(word_list)
			
 
				         all_elements.append(list_elements)
			
 
				-
			
 
				-
			
 
				-    #### NEXT SORT ELEMENTS IN EACH BLOCK BY THEIR X AND Y COORDINATES
			
 
				-    #### FIRST TRYING XMIN und YMAX
			
 
				-    ###FIRST CHECKING IF THE ELEMENTS ARE VERTICAL, IF YES THEN NO SORTING
			
 
				     new_all_elements = []
			
 
				 
			
 
				     for element in all_elements:
			
 
				         later_bigger = (float(element[-1][0])-(float(element[0][0]))) #check if xmin from first element is bigger than xmin from last element
			
 
				-        abstand_x = abs(float(element[-1][0])-(float(element[0][2])))
			
 
				-        abstand_y = abs(float(element[-1][3])-float(element[0][1]))
			
 
				         if later_bigger >= -5:
			
 
				             new_all_elements.append(element)
			
 
				         else:
			
@@ -74,7 +60,6 @@ def extract_isos(result):
 
				 
			
 
				     return details_, str(general_tol)
			
 
				 
			
 
				-
			
 
				 def get_tables(result):
			
 
				     reg = r"(Start drawing)|(All dimensions)"
			
 
				     tables = []
			
--- a/organize_drawing_according_to_details_new.py
+++ b/organize_drawing_according_to_details_new.py
@@ -12,9 +12,6 @@ def get_details(result): #search for all details in drawing and store it in list
 
				     number = len(details)
			
 
				     return details, number
			
 
				 
			
 
				-
			
 
				-
			
 
				-
			
 
				 def get_borders(details, tables):
			
 
				     sections = []
			
 
				     #print(coords)
			
@@ -76,9 +73,9 @@ def get_borders(details, tables):
 
				                 table_xmax = table[2]
			
 
				             table_ymin = table[1]
			
 
				             if y_max > table_ymin:
			
 
				-                if firstx_min > table_xmin and firstx_min < table_xmax:
			
 
				+                if table_xmin < firstx_min < table_xmax:
			
 
				                     y_max = table_ymin
			
 
				-                elif x_max > table_xmin and x_max < table_xmax:
			
 
				+                elif table_xmin < x_max < table_xmax:
			
 
				                     y_max = table_ymin
			
 
				 
			
 
				         sections.append((first,x_min, y_min,x_max,y_max))
			
@@ -133,7 +130,7 @@ def main_function(result, tables):
 
				             section.append(list(("No details",list((000.000,000.000,100000000.000,10000000.000)))))
			
 
				 
			
 
				 
			
 
				-    dict = {}
			
 
				+    dict_help = {}
			
 
				 
			
 
				     for res in result:
			
 
				         for det in section:
			
@@ -143,11 +140,11 @@ def main_function(result, tables):
 
				             if intersects(det,result[res]):
			
 
				                 name = det[0]
			
 
				                 help_dict[res] = result[res]
			
 
				-                if name in dict:
			
 
				-                    dict[name].update(help_dict)
			
 
				+                if name in dict_help:
			
 
				+                    dict_help[name].update(help_dict)
			
 
				                 else:
			
 
				-                    dict[name] = help_dict
			
 
				+                    dict_help[name] = help_dict
			
 
				                 break
			
 
				 
			
 
				-    return dict, details_dict
			
 
				+    return dict_help, details_dict
			
 
				 
			
--- a/read_from_clustered_merged.py
+++ b/read_from_clustered_merged.py
@@ -1,3 +1,5 @@
 
				+# coding=utf8
			
 
				+import re
			
 
				 import csv
			
 
				 
			
 
				 def read(file):
			
@@ -76,3 +78,55 @@ def read(file):
 
				             else:
			
 
				                 dict[element] = coords
			
 
				     return dict
			
 
				+
			
 
				+
			
 
				+def print_clean(dims): ##alles raus was nicht relevant ist! und zeichen ersetzen!
			
 
				+    dims_new = {}
			
 
				+    reg_clean = r"ISO|[a-zA-Z]{4,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$|^[A-Z]{1}$|^mm$|^\d{2}\.\d{2}\.\d{4}|^-$|A\d|^\d{1}$|^[A-Za-z]{3,}\.?$|^\d{5}|^\d{1}\s\W\s\d"
			
 
				+    for dim in dims:
			
 
				+        if re.search(reg_clean, dim):
			
 
				+            continue
			
 
				+        else:
			
 
				+            coords = dims[dim]
			
 
				+            if re.search(r"b\s\d*\W?\d*\s.",dim):
			
 
				+                dim = dim.replace('b', u"\u27C2")
			
 
				+            if re.search(r"g\s\d*\W?\d*", dim):
			
 
				+                dim = dim.replace('g', u"\u232D")
			
 
				+            if re.search(r"f\s\d*\W?\d*", dim):
			
 
				+                dim = dim.replace('f',  u"\u2225")
			
 
				+            if re.search(r"r\s\d*\W?\d*", dim):
			
 
				+                dim = dim.replace('r', u"\u25CE")
			
 
				+            if re.search(r"i\s\d*\W?\d*", dim):
			
 
				+                dim = dim.replace('i', u"\u232F")
			
 
				+            if re.search(r"j\s\d*\W?\d*", dim):
			
 
				+                dim = dim.replace('j', u"\u2316")
			
 
				+            if re.search(r"d\s\d*\W?\d*", dim):
			
 
				+                dim = dim.replace('d', u"\u2313")
			
 
				+            if re.search(r"c\s+\d*", dim):
			
 
				+                dim = dim.replace('c', u"\u23E5")
			
 
				+            if re.search(r"n\s+\d*", dim):
			
 
				+                dim = dim.replace('n', u"\u2300")
			
 
				+            if "È" in dim:
			
 
				+                dim = dim.replace('È', 'GG')
			
 
				+            if "`" in dim:
			
 
				+                dim = dim.replace('`', u"\u00B1")
			
 
				+            if "#" in dim:
			
 
				+                dim = dim.replace('#', "↔")
			
 
				+            if "⌀" in dim:
			
 
				+                dim = dim.replace('⌀', "Ø")
			
 
				+            reg12 = re.compile(r"(.*\d{1,4}\W?\d{0,4})\s?\+\s-\s?(\d{1,4}\W?\d{0,4})\s?(\d{1,4}\W?\d{0,3})") ##???? was machst du?? nach toleranzen suchen, mit +/- blabla
			
 
				+            reg13 = re.compile(r"(.*)\+\s\+\s(\d*\W\d*)\s(\d*\W\d*)(.*)")
			
 
				+            reg14 = re.compile(r"(\+\s?\d*,?.?\d*)\s*(\d*,?.?\d*)\s*(\+?\s?\-?\s?\d*,?.?\d*)")
			
 
				+            g = re.search(reg12, dim)
			
 
				+            f = re.search(reg13, dim)
			
 
				+            e = re.search(reg14, dim)
			
 
				+            if g:
			
 
				+                dim = re.sub(reg12, g.group(1) + " +" + g.group(2) + " -" + g.group(3), dim) # +/- toleranzen schön darstellen
			
 
				+            elif f:
			
 
				+                dim = f.group(1) + "+" + f.group(2) + " +" + f.group(3) + f.group(4)
			
 
				+            elif e:
			
 
				+                dim = e.group(2) + " " + e.group(1) + " " + e.group(3)
			
 
				+
			
 
				+            dim = dim.replace(" ,",".").replace(", ",".").replace(",",".")
			
 
				+            dims_new[dim] = coords
			
 
				+    return dims_new
			
--- a/regex_clean_new.py
+++ b/regex_clean_new.py
@@ -1,59 +0,0 @@
 
				-# coding=utf8
			
 
				-import re
			
 
				-
			
 
				-
			
 
				-def print_clean(dims): ##alles raus was nicht relevant ist! und zeichen ersetzen!
			
 
				-    dims_new = {}
			
 
				-    reg_clean = r"ISO|[a-zA-Z]{4,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$|^[A-Z]{1}$|^mm$|^\d{2}\.\d{2}\.\d{4}|^-$|A\d|^\d{1}$|^[A-Za-z]{3,}\.?$|^\d{5}|^\d{1}\s\W\s\d"
			
 
				-    for dim in dims:
			
 
				-        if re.search(reg_clean, dim):
			
 
				-            continue
			
 
				-        else:
			
 
				-            coords = dims[dim]
			
 
				-            if re.search(r"b\s\d*\W?\d*\s.",dim):
			
 
				-                dim = dim.replace('b', u"\u27C2")
			
 
				-            if re.search(r"g\s\d*\W?\d*", dim):
			
 
				-                dim = dim.replace('g', u"\u232D")
			
 
				-            if re.search(r"f\s\d*\W?\d*", dim):
			
 
				-                dim = dim.replace('f',  u"\u2225")
			
 
				-            if re.search(r"r\s\d*\W?\d*", dim):
			
 
				-                dim = dim.replace('r', u"\u25CE")
			
 
				-            if re.search(r"i\s\d*\W?\d*", dim):
			
 
				-                dim = dim.replace('i', u"\u232F")
			
 
				-            if re.search(r"j\s\d*\W?\d*", dim):
			
 
				-                dim = dim.replace('j', u"\u2316")
			
 
				-            if re.search(r"d\s\d*\W?\d*", dim):
			
 
				-                dim = dim.replace('d', u"\u2313")
			
 
				-            if re.search(r"c\s+\d*", dim):
			
 
				-                dim = dim.replace('c', u"\u23E5")
			
 
				-            if re.search(r"n\s+\d*", dim):
			
 
				-                dim = dim.replace('n', u"\u2300")
			
 
				-            if "È" in dim:
			
 
				-                dim = dim.replace('È', 'GG')
			
 
				-            if "`" in dim:
			
 
				-                dim = dim.replace('`', u"\u00B1")
			
 
				-            if "#" in dim:
			
 
				-                dim = dim.replace('#', "↔")
			
 
				-            if "⌀" in dim:
			
 
				-                dim = dim.replace('⌀', "Ø")
			
 
				-            reg12 = re.compile(r"(.*\d{1,4}\W?\d{0,4})\s?\+\s-\s?(\d{1,4}\W?\d{0,4})\s?(\d{1,4}\W?\d{0,3})") ##???? was machst du?? nach toleranzen suchen, mit +/- blabla
			
 
				-            reg13 = re.compile(r"(.*)\+\s\+\s(\d*\W\d*)\s(\d*\W\d*)(.*)")
			
 
				-            reg14 = re.compile(r"(\+\s?\d*,?.?\d*)\s*(\d*,?.?\d*)\s*(\+?\s?\-?\s?\d*,?.?\d*)")
			
 
				-            g = re.search(reg12, dim)
			
 
				-            f = re.search(reg13, dim)
			
 
				-            e = re.search(reg14, dim)
			
 
				-            if g:
			
 
				-                dim = re.sub(reg12, g.group(1) + " +" + g.group(2) + " -" + g.group(3), dim) # +/- toleranzen schön darstellen
			
 
				-                #print(dim)
			
 
				-            elif f:
			
 
				-                dim = f.group(1) + "+" + f.group(2) + " +" + f.group(3) + f.group(4)
			
 
				-            elif e:
			
 
				-                dim = e.group(2) + " " + e.group(1) + " " + e.group(3)
			
 
				-
			
 
				-            dim = dim.replace(" ,",".").replace(", ",".").replace(",",".")
			
 
				-            dims_new[dim] = coords
			
 
				-
			
 
				-    #for dim in dims_new:
			
 
				-    #    print(dim)
			
 
				-    #print(dims_new)
			
 
				-    return dims_new
			
--- a/temporary/33333out.html
+++ b/temporary/33333out.html
--- a/temporary/8409194out.html
+++ b/temporary/8409194out.html