"""Geometry statistics for blocks of word bounding boxes.

The functions in this module take "blocks" (lists of word entries whose
first four items are used as ``x_min, y_min, x_max, y_max``), compute the
enclosing rectangle of every block, and derive size and pairwise-distance
statistics from those rectangles.
"""

import csv
from math import sqrt

import numpy as np
import pandas
from scipy import stats

from clustering_precomputed_dbscan_noParallels import intersects


def _as_word_list(element):
    """Return *element* as a list of word entries.

    A bare 5-item word (first item not a list) is wrapped in a one-element
    list; anything else is returned unchanged.
    """
    if len(element) == 5 and not isinstance(element[0], list):
        return [element]
    return element


def _bounding_box(words, min_start):
    """Return ``(xmin, ymin, xmax, ymax)`` enclosing every entry of *words*.

    Minima start at *min_start* and maxima start at 0, so an empty *words*
    yields ``(min_start, min_start, 0, 0)`` and maxima never drop below 0.
    Items 0..3 of each word are converted with ``float``.
    """
    xmin = ymin = min_start
    xmax = ymax = 0
    for word in words:
        # Some entries are nested one level deeper: [[x0, y0, x1, y1, _], ...]
        if isinstance(word[0], list) and len(word[0]) == 5:
            word = word[0]
        xmin = min(xmin, float(word[0]))
        ymin = min(ymin, float(word[1]))
        xmax = max(xmax, float(word[2]))
        ymax = max(ymax, float(word[3]))
    return xmin, ymin, xmax, ymax


def get_average_xy(list_input, path):
    """Compute the four corner points of every block's bounding rectangle.

    Despite its name this function does not average anything: for each block
    in *list_input* it determines the enclosing rectangle and records its
    corners.

    Side effect: writes one row per block (the block itself plus its four
    corners, ``;``-separated) to
    ``<path>/temporary/list_to_csv_with_corner_points_distances.csv``.
    The ``temporary`` directory must already exist.

    Args:
        list_input: iterable of blocks; a block is either a single 5-item
            word or a list of such words.
        path: base directory containing the ``temporary`` sub-directory.

    Returns:
        pandas.DataFrame with the columns ``point_xmi_ymi``,
        ``point_xma_ymi``, ``point_xmi_yma`` and ``point_xma_yma``; every
        cell is an ``[x, y]`` list and there is one row per block.
    """
    csv_name = path + "/temporary/list_to_csv_with_corner_points_distances.csv"
    columns = ["point_xmi_ymi", "point_xma_ymi", "point_xmi_yma", "point_xma_yma"]
    result_df = pandas.DataFrame(columns=columns)

    # ``with`` guarantees the file is closed even if a malformed block raises;
    # ``newline=""`` is what the csv module requires to avoid doubled line
    # endings on platforms that translate "\n".
    with open(csv_name, "w", newline="") as result_file:
        writer = csv.writer(result_file, delimiter=";")
        writer.writerow(["element", *columns])

        for element in list_input:
            words = _as_word_list(element)
            xmin, ymin, xmax, ymax = _bounding_box(words, 100000000)
            corners = [[xmin, ymin], [xmax, ymin], [xmin, ymax], [xmax, ymax]]
            writer.writerow([words, *corners])
            result_df.loc[len(result_df)] = corners

    return result_df


def dist(rectangle1, rectangle2):
    """Return the minimal corner-to-corner distance between two rectangles.

    Only the first four items of each rectangle are used; each must be the
    string form of an ``[x, y]`` point (as produced by reading the corner
    CSV back in). The result is the smallest Euclidean distance between any
    corner of *rectangle1* and any corner of *rectangle2*, never more than
    the starting sentinel ``100000000``. If ``intersects`` reports that the
    rectangles overlap, ``0`` is returned instead.
    """
    # NOTE(security): eval() executes arbitrary expressions. The strings are
    # expected to come from the temporary CSV this program wrote itself; do
    # not feed externally supplied files through this function.
    # Parsing is hoisted out of the nested loop so each corner is evaluated once.
    corners1 = [eval(point) for point in rectangle1[:4]]
    corners2 = [eval(point) for point in rectangle2[:4]]

    distance = 100000000
    for p1 in corners1:
        for p2 in corners2:
            candidate = sqrt(
                (float(p2[0]) - float(p1[0])) ** 2
                + (float(p2[1]) - float(p1[1])) ** 2
            )
            if candidate < distance:
                distance = candidate

    if intersects(rectangle1, rectangle2):
        distance = 0
    return distance


def size_blocks(list_input):
    """Compute extent statistics of the blocks' bounding rectangles.

    For every block the width, height and diagonal of its enclosing
    rectangle are computed (individual words are not measured). The median
    and mode of the rounded widths and heights are printed, and every raw
    word entry is printed while it is processed.

    Args:
        list_input: iterable of blocks; a block is either a single 5-item
            word or a list of such words.

    Returns:
        Tuple ``(min_size_x, max_size_x, min_size_y, max_size_y, diagonal)``
        where ``diagonal`` is a list with one diagonal length per block.
        With no blocks the minima stay at ``1000000000`` and the maxima at 0.
    """
    min_size_x = min_size_y = 1000000000
    max_size_x = max_size_y = 0
    x_size = []
    y_size = []
    diagonal = []

    for element in list_input:
        words = _as_word_list(element)
        for word in words:
            print(word)
        xmin, ymin, xmax, ymax = _bounding_box(words, 1000000000)

        distance_x = xmax - xmin
        distance_y = ymax - ymin
        # Pythagorean theorem. The original called math.sqrt although only
        # ``sqrt`` is imported, which raised NameError.
        diagonal.append(sqrt(distance_x ** 2 + distance_y ** 2))
        x_size.append(distance_x)
        y_size.append(distance_y)

        min_size_x = min(min_size_x, distance_x)
        max_size_x = max(max_size_x, distance_x)
        min_size_y = min(min_size_y, distance_y)
        max_size_y = max(max_size_y, distance_y)

    x_size = np.array(x_size).round(decimals=0)
    median_size_x = np.median(x_size)
    modus_size_x = stats.mode(x_size)

    y_size = np.array(y_size).round(decimals=0)
    median_size_y = np.median(y_size)
    modus_size_y = stats.mode(y_size)

    print("Size_Median_x:", median_size_x)
    print("Size_Median_y:", median_size_y)
    print("Size_Modus_x:", modus_size_x)
    print("Size_Modus_y:", modus_size_y)
    return min_size_x, max_size_x, min_size_y, max_size_y, diagonal


def distance_btw_blocks(result, path):
    """Build the pairwise distance matrix of all block rectangles.

    *result* is written to ``<path>/temporary/blub_distances.csv`` and read
    back with the csv module, so that every corner point reaches ``dist`` in
    the string form it expects. The median and mode of the rounded non-zero
    distances are printed.

    Args:
        result: DataFrame of corner points as returned by ``get_average_xy``.
        path: base directory containing the ``temporary`` sub-directory.

    Returns:
        numpy array ``dm`` with ``dm[i][j] == dist(row_i, row_j)``.
    """
    csv_name = path + "/temporary/blub_distances.csv"
    result.to_csv(csv_name, sep=";", index=False, header=False)
    with open(csv_name, newline="") as csvfile:
        rows = list(csv.reader(csvfile, delimiter=";"))

    dm = np.asarray([[dist(p1, p2) for p2 in rows] for p1 in rows])

    # Zero entries (identical or intersecting rectangles) are excluded from
    # the printed statistics only; the returned matrix keeps them.
    dm_flattened = dm.flatten()
    dm_flattened = dm_flattened[dm_flattened != 0].round(decimals=0)
    median = np.median(dm_flattened)
    mode = stats.mode(dm_flattened)

    print("Distance Mode:", mode)
    print("Distance Median:", median)
    return dm


def distance_knn(dm):
    """Collect the two smallest non-zero distances of every matrix row.

    Args:
        dm: iterable of numpy rows, e.g. the matrix returned by
            ``distance_btw_blocks``.

    Returns:
        Flat list containing, per row, up to two of its smallest non-zero
        entries rounded to two decimals.
    """
    knn = []
    for row in dm:
        non_zero = row[row != 0].round(decimals=2)
        knn.extend(sorted(non_zero)[:2])
    return knn


def avg_words_block_clustered(words, cluster):
    """Return ``words / cluster``, the average number of words per block.

    Raises ZeroDivisionError when *cluster* is 0 (for plain Python numbers).
    """
    return words / cluster


# Disabled example driver, kept for reference:
#
# def main(uuid, filepath):
#     path = "/home/bscheibel/PycharmProjects/clustering"
#     filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
#     result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
#     print("number_blocks:", number_blocks)
#     print("number_words:", number_words)
#     print("avg words/blocks", number_words/number_blocks)
#     size_blocks(result)
#
#     result_df = get_average_xy(result, path)
#
#     # result = clustering_precomputed_dbscan_og_without.get_average_xy(result, path) #input: array of arrays, output: either csv file or array of arrays
#     #print(result)
#     result.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
#     #with open(path+"/temporary/blub.csv") as csvfile:
#     #    readCSV = csv.reader(csvfile, delimiter=';')
#     #    result = list(readCSV)
#     dm = distance_btw_blocks(result_df, path)
#     knn = distance_knn(dm)
#     median= np.median(knn)
#     mode = stats.mode(knn)
#     min_nn = min(knn)
#     avg = np.sum(knn)/len(knn)
#     print("min_knn:", min_nn)
#     print("knn_mean:", avg)
#     print("knn_median:", median)
#     print("knn_mode:", mode)
#     #knn.nearestNeighbors(dm)

#main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/Werkstattzeichnung Zwischenwelle.pdf")