Browse Source

added files

bscheibel 4 years ago
parent
commit
06500c09a2

+ 35 - 5
dbscan_clustering.py

@@ -1,5 +1,7 @@
 import numpy as np
 import pandas
+import csv
+import order_bounding_boxes_in_each_block
 
 from sklearn.cluster import DBSCAN
 from sklearn import metrics
@@ -8,15 +10,15 @@ from sklearn.preprocessing import StandardScaler
 
 def cluster(file_in, file_out):
     # #############################################################################
-    data_df = pandas.read_csv("values_fromhtml_GV12.csv", sep=",")
+    data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/list_to_csv_with_avg_points.csv", sep=";")
     data_df.head(3)
-    data = data_df[["X1","Y1","X2","Y2"]]
+    data = data_df[["xavg_elem","yavg_elem"]]
     print(data)
     data = StandardScaler().fit_transform(data)
 
     # #############################################################################
     # Compute DBSCAN
-    db = DBSCAN(eps=0.2, min_samples=1).fit(data)
+    db = DBSCAN(eps=0.1, min_samples=1).fit(data)
     core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
     core_samples_mask[db.core_sample_indices_] = True
     labels = db.labels_
@@ -58,9 +60,37 @@ def cluster(file_in, file_out):
     plt.title('Estimated number of clusters: %d' % n_clusters_)
     plt.show()"""
 
-    print(data_df.head(3))
+    #print(data_df.head(3))
     #data_df.to_csv("values_clusteredfromPDF_GV12.csv")
-    data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("values_clusteredfromPDF_GV12.csv")
+    data_df.groupby('cluster')['element'].apply(' '.join).reset_index().to_csv("values_clusteredfromHTML_layout_LH.csv", delimiter=";")
 
 
def get_average_xy(list_input, csv_name="temporary/list_to_csv_with_avg_points.csv"):
    """Append one CSV row per block containing the block's average centre point.

    Each entry of *list_input* is a block: a list of word bounding boxes of the
    form ``[xmin, ymin, xmax, ymax, ...]`` (numeric strings, as produced by
    ``get_bound_box``).  For every block the average of the box centre points
    is computed and written to *csv_name* as ``element;xavg_elem;yavg_elem``.

    Args:
        list_input: list of blocks, each a non-empty list of bounding boxes.
        csv_name: target CSV path; defaults to the original hard-coded path so
            existing callers are unaffected.

    Returns:
        str: the path of the CSV file that was written.
    """
    # newline="" is required by the csv module when writing; mode "a" keeps
    # the original append semantics (note the header is re-written per call).
    with open(csv_name, "a", newline="") as result_file:
        writer = csv.writer(result_file, delimiter=";")
        writer.writerow(["element", "xavg_elem", "yavg_elem"])
        for element in list_input:
            if not element:
                # Guard: an empty block would otherwise divide by zero.
                continue
            x_sum = 0.0
            y_sum = 0.0
            for box in element:
                # Centre of each box: midpoint of (xmin, xmax) and (ymin, ymax).
                x_sum += (float(box[0]) + float(box[2])) / 2
                y_sum += (float(box[1]) + float(box[3])) / 2
            writer.writerow([element, x_sum / len(element), y_sum / len(element)])
    return csv_name
+
+
#cluster(33,33)
#result = order_bounding_boxes_in_each_block.get_bound_box()
#get_average_xy(result)
# NOTE(review): runs the full clustering unconditionally at import time --
# presumably a debug entry point; consider an `if __name__ == "__main__":` guard.
cluster(33,33)

File diff suppressed because it is too large
+ 1222 - 12
drawings/5129275_Rev01-GV12.html


File diff suppressed because it is too large
+ 2891 - 827
drawings/5152166_Rev04.html


+ 50 - 0
order_bounding_boxes_in_each_block.py

@@ -0,0 +1,50 @@
+### FIRST READ EACH BLOCK IN AN ARRAY
+
+from bs4 import BeautifulSoup
+
def get_bound_box(html_path='/home/bscheibel/PycharmProjects/dxf_reader/drawings/5152166_Rev04.html'):
    """Extract word bounding boxes grouped by <block> from an HTML layout file.

    Parses the pdftotext-style HTML in *html_path*, collects for every
    <block> tag a list of its <word> entries as
    ``[xmin, ymin, xmax, ymax, text]``, then sorts the words of each block
    left-to-right by xmin unless the block already reads left-to-right
    (e.g. vertical text is left untouched).  The words of every block are
    printed as a debug trace.

    Args:
        html_path: HTML file to parse; defaults to the original hard-coded
            path so existing callers are unaffected.

    Returns:
        list[list[list]]: one list of [xmin, ymin, xmax, ymax, text] entries
        per <block>, in document order.
    """
    # "with" guarantees the file handle is closed even if parsing raises
    # (the original open()/close() pair leaked the handle on error).
    with open(html_path) as response:
        html_file = BeautifulSoup(response.read(), 'html.parser')

    # First pass: read each <block> into a list of word records.
    all_elements = []
    for block in html_file.findAll('block'):
        list_elements = []
        for word in block.findAll('word'):
            list_elements.append([
                word["xmin"],
                word["ymin"],
                word["xmax"],
                word["ymax"],
                word.string,
            ])
        all_elements.append(list_elements)

    # Second pass: sort words inside each block by xmin, but only when the
    # last word starts clearly left of the first one (5-unit tolerance) --
    # otherwise the block is assumed already ordered (or vertical).
    new_all_elements = []
    for element in all_elements:
        if not element:
            # Guard: a <block> with no <word> children would raise IndexError.
            new_all_elements.append(element)
            continue
        # xmin of the last word minus xmin of the first word.
        later_bigger = float(element[-1][0]) - float(element[0][0])
        if later_bigger >= -5:
            new_all_elements.append(element)
        else:
            new_all_elements.append(sorted(element, key=lambda k: float(k[0])))

    # Debug output: dump the text of each block in its final order.
    for element in new_all_elements:
        for word_entry in element:
            print(word_entry[4])
        print("\n")

    return new_all_elements

+ 0 - 0
output.csv


+ 7 - 0
read_from_clustered_merged.py

@@ -0,0 +1,7 @@
import csv

# Print the third column (the clustered text) of every row in the
# clustered-output CSV produced by the DBSCAN step.
CSV_PATH = "/home/bscheibel/PycharmProjects/dxf_reader/values_clusteredfromHTML_layout_LH.csv"

with open(CSV_PATH, "r") as source:
    for record in csv.reader(source, delimiter=","):
        print(record[2])

File diff suppressed because it is too large
+ 331 - 0
temporary/list_to_csv_with_avg_points.csv


File diff suppressed because it is too large
+ 181 - 0
values_clusteredfromHTML_layout_LH.csv