123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228 |
import ast
import csv
from math import sqrt
#import math

import numpy as np
import pandas
from scipy import stats

from clustering_precomputed_dbscan_noParallels import intersects
def get_average_xy(list_input, path):
    """Compute the four bounding-box corner points of every block.

    Despite its name, this function does not average anything: for each
    entry of ``list_input`` it determines the smallest and largest x and y
    values over all boxes of that entry and returns the resulting corners.

    Args:
        list_input: Iterable of blocks. A block is either a single box (a
            5-element sequence whose first element is not a list) or a
            sequence of boxes. A box may also be wrapped in a one-element
            list. Indices 0..3 of a box are read as xmin, ymin, xmax, ymax
            and converted with ``float``; index 4 is not used here.
        path: Base directory. ``path + "/temporary"`` must already exist.

    Returns:
        pandas.DataFrame with one row per block and the columns
        ``point_xmi_ymi``, ``point_xma_ymi``, ``point_xmi_yma``,
        ``point_xma_yma``; every cell is an ``[x, y]`` list.

    Side effects:
        Writes ``list_to_csv_with_corner_points_distances.csv`` (``;``
        separated, one row per block) into ``path + "/temporary"``.
    """
    csv_name = path + "/temporary/list_to_csv_with_corner_points_distances.csv"
    corner_columns = ["point_xmi_ymi", "point_xma_ymi", "point_xmi_yma", "point_xma_yma"]
    result_df = pandas.DataFrame(columns=corner_columns)

    # The context manager guarantees the file is closed even when a
    # malformed box raises; newline="" is what the csv module expects.
    with open(csv_name, 'w', newline="") as result_file:
        wr = csv.writer(result_file, delimiter=";")
        wr.writerow(["element"] + corner_columns)

        for element in list_input:
            # Sentinels: coordinates are expected to lie in [0, 100000000].
            ymin = 100000000
            ymax = 0
            xmin = 100000000
            xmax = 0

            # A bare box is treated as a block consisting of that one box.
            if len(element) == 5 and not isinstance(element[0], list):
                element = [element]

            # Get the smallest and largest x and y value for the whole block.
            for blub in element:
                if isinstance(blub[0], list) and len(blub[0]) == 5:
                    blub = blub[0]  # unwrap a box nested one level deeper
                xmin = min(xmin, float(blub[0]))
                ymin = min(ymin, float(blub[1]))
                xmax = max(xmax, float(blub[2]))
                ymax = max(ymax, float(blub[3]))

            point_xmi_ymi = [xmin, ymin]
            point_xma_ymi = [xmax, ymin]
            point_xmi_yma = [xmin, ymax]
            point_xma_yma = [xmax, ymax]
            wr.writerow([element, point_xmi_ymi, point_xma_ymi, point_xmi_yma, point_xma_yma])
            result_df.loc[len(result_df)] = [point_xmi_ymi, point_xma_ymi, point_xmi_yma, point_xma_yma]

    return result_df
def dist(rectangle1, rectangle2):
    """Return the minimal corner-to-corner distance between two rectangles.

    Args:
        rectangle1: Sequence whose first four entries are strings holding a
            Python literal of the form ``[x, y]`` (as produced when a row of
            corner points is read back from CSV).
        rectangle2: Same layout as ``rectangle1``.

    Returns:
        0 if ``intersects(rectangle1, rectangle2)`` is truthy; otherwise the
        smallest Euclidean distance between any corner of the first and any
        corner of the second rectangle, capped at 100000000.

    Raises:
        ValueError, SyntaxError: If a corner string is not a valid literal.
    """
    # The strings come from a file, so parse them as literals only instead
    # of executing them with eval(). Parsing is done once per rectangle
    # rather than once per point pair.
    corners1 = [ast.literal_eval(point) for point in rectangle1[:4]]
    corners2 = [ast.literal_eval(point) for point in rectangle2[:4]]

    distance = 100000000
    for point1 in corners1:
        x1 = float(point1[0])
        y1 = float(point1[1])
        for point2 in corners2:
            candidate = sqrt((float(point2[0]) - x1) ** 2 + (float(point2[1]) - y1) ** 2)
            if candidate < distance:
                distance = candidate

    # NOTE(review): intersects() is imported from another module; it
    # presumably reports whether the rectangles overlap - confirm there.
    if intersects(rectangle1, rectangle2):
        distance = 0
    return distance
def size_blocks(list_input):
    """Measure the x/y extent of every block (not of the individual words).

    For each block the bounding box over all of its boxes is computed. The
    median and mode of the widths and heights (rounded to whole numbers)
    are printed to stdout.

    Args:
        list_input: Iterable of blocks. A block is either a single box (a
            5-element sequence whose first element is not a list) or a
            sequence of boxes. A box may also be wrapped in a one-element
            list. Indices 0..3 of a box are read as xmin, ymin, xmax, ymax
            and converted with ``float``.

    Returns:
        Tuple ``(min_size_x, max_size_x, min_size_y, max_size_y, diagonal)``
        where ``diagonal`` is the list of bounding-box diagonals, one per
        block, in input order.
    """
    min_size_x = 1000000000
    max_size_x = 0
    min_size_y = 1000000000
    max_size_y = 0
    x_size = []
    y_size = []
    diagonal = []

    for element in list_input:
        # Sentinels: coordinates are expected to lie in [0, 1000000000].
        ymin = 1000000000
        ymax = 0
        xmin = 1000000000
        xmax = 0

        # A bare box is treated as a block consisting of that one box.
        if len(element) == 5 and not isinstance(element[0], list):
            element = [element]

        # Get the smallest and largest x and y value for the whole block.
        for blub in element:
            if isinstance(blub[0], list) and len(blub[0]) == 5:
                blub = blub[0]  # unwrap a box nested one level deeper
            xmin = min(xmin, float(blub[0]))
            ymin = min(ymin, float(blub[1]))
            xmax = max(xmax, float(blub[2]))
            ymax = max(ymax, float(blub[3]))

        distance_x = xmax - xmin
        distance_y = ymax - ymin
        # Pythagorean theorem. Uses the module-level `sqrt`; the `math`
        # module itself is not imported in this file.
        diagonal.append(sqrt(distance_x ** 2 + distance_y ** 2))
        x_size.append(distance_x)
        y_size.append(distance_y)

        min_size_x = min(min_size_x, distance_x)
        max_size_x = max(max_size_x, distance_x)
        min_size_y = min(min_size_y, distance_y)
        max_size_y = max(max_size_y, distance_y)

    x_size = np.array(x_size).round(decimals=0)
    median_size_x = np.median(x_size)
    modus_size_x = stats.mode(x_size)

    y_size = np.array(y_size).round(decimals=0)
    median_size_y = np.median(y_size)
    modus_size_y = stats.mode(y_size)

    print("Size_Median_x:", median_size_x)
    print("Size_Median_y:", median_size_y)
    print("Size_Modus_x:", modus_size_x)
    print("Size_Modus_y:", modus_size_y)
    return min_size_x, max_size_x, min_size_y, max_size_y, diagonal
def distance_btw_blocks(result, path):
    """Build the pairwise distance matrix between all blocks.

    Args:
        result: pandas.DataFrame with one row per block whose first four
            columns hold the block's corner points.
        path: Base directory. ``path + "/temporary"`` must already exist.

    Returns:
        numpy array ``dm`` where ``dm[i][j]`` is ``dist(row_i, row_j)``.

    Side effects:
        Writes ``blub_distances.csv`` into ``path + "/temporary"`` and prints
        the mode and median of all non-zero distances (rounded to whole
        numbers) to stdout.
    """
    csv_path = path + "/temporary/blub_distances.csv"

    # The round trip through CSV turns every cell into a string, which is
    # the representation dist() parses.
    result.to_csv(csv_path, sep=";", index=False, header=None)
    with open(csv_path, newline="") as csvfile:
        rows = list(csv.reader(csvfile, delimiter=';'))

    dm = np.asarray([[dist(p1, p2) for p2 in rows] for p1 in rows])

    # Zero entries (the diagonal and any pair dist() reports as 0) are
    # excluded from the statistics.
    nonzero = dm.flatten()
    nonzero = nonzero[nonzero != 0].round(decimals=0)
    median = np.median(nonzero)
    mode = stats.mode(nonzero)

    print("Distance Mode:", mode)
    print("Distance Median:", median)
    return dm
def distance_knn(dm):
    """Collect the two smallest non-zero distances of every matrix row.

    Args:
        dm: Iterable of numpy arrays, e.g. the rows of a 2-D distance matrix.

    Returns:
        Flat list with up to two values per row (rounded to two decimals),
        in row order; rows with fewer than two non-zero entries contribute
        fewer values.
    """
    nearest = []
    for distances in dm:
        # Drop zeros first, then round, then pick the two smallest values.
        candidates = distances[distances != 0].round(decimals=2)
        nearest.extend(sorted(candidates)[:2])
    return nearest
def avg_words_block_clustered(words, cluster):
    """Return the average number of words per block.

    Args:
        words: Total number of words.
        cluster: Number of blocks the words are distributed over.

    Returns:
        ``words`` divided by ``cluster``.
    """
    return words / cluster
- # def main(uuid, filepath):
- # path = "/home/bscheibel/PycharmProjects/clustering"
- # filename = order_bounding_boxes_in_each_block.pdf_to_html(uuid, filepath, path)
- # result, number_blocks, number_words = order_bounding_boxes_in_each_block.get_bound_box(filename)
- # print("number_blocks:", number_blocks)
- # print("number_words:", number_words)
- # print("avg words/blocks", number_words/number_blocks)
- # size_blocks(result)
- #
- # result_df = get_average_xy(result, path)
- #
- #
- # result = clustering_precomputed_dbscan_og_without.get_average_xy(result, path) #input: array of arrays, output: either csv file or array of arrays
- # #print(result)
- # result.to_csv(path+"/temporary/blub.csv", sep=";", index=False, header=None)
- # #with open(path+"/temporary/blub.csv") as csvfile:
- # # readCSV = csv.reader(csvfile, delimiter=';')
- # # result = list(readCSV)
- # dm = distance_btw_blocks(result_df, path)
- # knn = distance_knn(dm)
- # median= np.median(knn)
- # mode = stats.mode(knn)
- # min_nn = min(knn)
- # avg = np.sum(knn)/len(knn)
- # print("min_knn:", min_nn)
- # print("knn_mean:", avg)
- # print("knn_median:", median)
- # print("knn_mode:", mode)
- #knn.nearestNeighbors(dm)
- #main("33333", "/home/bscheibel/PycharmProjects/clustering/drawings/Werkstattzeichnung Zwischenwelle.pdf")
|