5 years ago · 69fda4807a
--- a/cluster_by_distance_csv_only.py
+++ b/cluster_by_distance_csv_only.py
--- a/csv_to_pandadf.py
+++ b/csv_to_pandadf.py
@@ -7,19 +7,23 @@ import re
 
				 #data = data_df[["X1","Y1","X2","Y2"]]
			
 
				 #print(data)
			
 
				 
			
 
				-text = []
			
 
				-with open('text_merged.csv', 'r') as csvFile:
			
 
				-    reader = csv.reader(csvFile, delimiter=",")
			
 
				-    for row in reader:
			
 
				-        text.append(row[2])
			
 
				-csvFile.close()
			
 
				-###extract ISOs
			
 
				-matches = []
			
 
				-regex = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)"
			
 
				-for line in text:
			
 
				-    match = re.findall(regex, line)
			
 
				-    if match:
			
 
				-        matches.append(match)
			
 
				 
			
 
				-print(matches)
			
 
				+def read_csv(file):
			
 
				+    text = []
			
 
				+    with open(file, 'r') as csvFile:
			
 
				+        reader = csv.reader(csvFile, delimiter=",")
			
 
				+        for row in reader:
			
 
				+            text.append(row[2])
			
 
				+    csvFile.close()
			
 
				+    ###extract ISOs
			
 
				+    matches = []
			
 
				+    regex = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)"
			
 
				+    for line in text:
			
 
				+        match = re.findall(regex, line)
			
 
				+        if match:
			
 
				+            matches.append(match)
			
 
				 
			
 
				+    print(matches)
			
 
				+
			
 
				+
			
 
				+    return text
			
--- a/dbscan_clustering.py
+++ b/dbscan_clustering.py
@@ -10,15 +10,15 @@ from sklearn.preprocessing import StandardScaler
 
				 
			
 
				 
			
 
				 # #############################################################################
			
 
				-data_df = pandas.read_csv("values.csv", sep=",")
			
 
				+data_df = pandas.read_csv("temporary/5129275_extracted.csv", sep=";")
			
 
				 data_df.head(3)
			
 
				-data = data_df[["X1","Y1","X2","Y2"]]
			
 
				+data = data_df[["X","Y"]]
			
 
				 print(data)
			
 
				 data = StandardScaler().fit_transform(data)
			
 
				 
			
 
				 # #############################################################################
			
 
				 # Compute DBSCAN
			
 
				-db = DBSCAN(eps=0.15, min_samples=1).fit(data)
			
 
				+db = DBSCAN(eps=0.2, min_samples=1).fit(data)
			
 
				 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
			
 
				 core_samples_mask[db.core_sample_indices_] = True
			
 
				 labels = db.labels_
			
@@ -61,5 +61,5 @@ plt.title('Estimated number of clusters: %d' % n_clusters_)
 
				 plt.show()
			
 
				 
			
 
				 print(data_df.head(3))
			
 
				-data_df.to_csv("values_clustered.csv")
			
 
				-data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("text_merged.csv")
			
 
				+data_df.to_csv("values_clustered_GV12.csv")
			
 
				+data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("text_merged_GV12.csv")
			
--- a/find_groups.py
+++ b/find_groups.py
@@ -1,30 +0,0 @@
 
				-from itertools import combinations, product
			
 
				-def canMerge (g, h):
			
 
				-    for i, j in g:
			
 
				-        for x, y in h:
			
 
				-            if abs(i - x) <= 1 and abs(j - y) <= 1:
			
 
				-                return True
			
 
				-    return False
			
 
				-
			
 
				-def findGroups (field):
			
 
				-    # initialize one-element groups
			
 
				-    groups = [[(i, j)] for i, j in product(range(len(field)), range(len(field[0]))) if field[i][j] != '  ']
			
 
				-
			
 
				-    # keep joining until no more joins can be executed
			
 
				-    merged = True
			
 
				-    while merged:
			
 
				-        merged = False
			
 
				-        for g, h in combinations(groups, 2):
			
 
				-            if canMerge(g, h):
			
 
				-                g.extend(h)
			
 
				-                groups.remove(h)
			
 
				-                merged = True
			
 
				-                break
			
 
				-
			
 
				-    return groups
			
 
				-
			
 
				-# intialize field
			
 
				-field = "drawings/5129275_Rev01-GV12.txt"
			
 
				-groups = findGroups(field)
			
 
				-
			
 
				-print((groups)) # 3
			
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
 
				 import read_data
			
 
				-import read_text_lines
			
 
				+import read_text_lines_from_dxf
			
 
				 #import merge_pandas
			
 
				 
			
 
				 
			
 
				 
			
 
				 file = "drawings/5152166.dxf"
			
 
				 file_out = "5152166_extracted.csv"
			
 
				-read_text_lines.read(file, file_out)
			
 
				+read_text_lines_from_dxf.read(file, file_out)
			
 
				 read_data.read_dimensions(file_out, 0)
			
--- a/old/blob_dedection.py
+++ b/old/blob_dedection.py
--- a/output_image.png
+++ b/output_image.png
--- a/read_text_lines_from_dxf.py
+++ b/read_text_lines_from_dxf.py
--- a/regex_extraction.py
+++ b/regex_extraction.py
@@ -1,67 +1,101 @@
 
				 # coding=utf8
			
 
				 import re
			
 
				+import csv_to_pandadf
			
 
				 
			
 
				 def clean(extracted_dimensions):
			
 
				-    #next part extracts the isos and removes everything we dont need like just text or the X:X stuff, einzelne buchstaben und zahlen
			
 
				+    #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
			
 
				     for dim in extracted_dimensions:
			
 
				         if re.match(regex_isos, dim): #isos
			
 
				             match = re.findall(regex_isos,dim)
			
 
				             isos.append(match[0])
			
 
				             extracted_dimensions.remove(dim)
			
 
				 
			
 
				-
			
 
				-    for dim in extracted_dimensions:
			
 
				-        match =re.match(reg_all, dim)
			
 
				-        if match:
			
 
				-            #print(re.findall(reg_all,dim))
			
 
				-            #print(match[0])
			
 
				-            try:
			
 
				-                extracted_dimensions.remove(dim)
			
 
				-            except:
			
 
				-                print("error")
			
 
				+    i = 0
			
 
				+    new_matches = []
			
 
				+    for match in extracted_dimensions:
			
 
				+        # print(match)
			
 
				+        match = match.split('\n')[0]
			
 
				+        # if len(match)>1:
			
 
				+        #    extraction.append(match[1])
			
 
				+        # print(match[1])
			
 
				+        if not re.search(reg_all, match):
			
 
				+            new_matches.append(match)
			
 
				+        i += 1
			
 
				 
			
 
				     #print(isos)
			
 
				     #print(extracted_dimensions)
			
 
				-    return isos, extracted_dimensions
			
 
				+    return isos, new_matches
			
 
				 
			
 
				 
			
 
				-def print_clean(extracted_dimensions):
			
 
				-    for dim in extracted_dimensions:
			
 
				-        if "b" in dim:
			
 
				+def print_clean(dims):
			
 
				+    mal = "no"
			
 
				+    vorzeichen = "no"
			
 
				+    for dim in dims:
			
 
				+        if re.match(r"b\s\d*\W?\d*\s.",dim):
			
 
				             print("Rechtwinkligkeit")
			
 
				             print(dim)
			
 
				-        if "g" in dim:
			
 
				+            continue
			
 
				+        if re.match(r"g\s\d*\W?\d*", dim):
			
 
				             print("Zylinderform")
			
 
				             print(dim)
			
 
				-        if "f" in dim:
			
 
				+            continue
			
 
				+        if re.match(r"g\s\d*\W?\d*", dim):
			
 
				             print("Parallelität")
			
 
				             print(dim)
			
 
				-        if "c" in dim:
			
 
				+            continue
			
 
				+        if re.match(r"g\s\d*\W?\d*", dim):
			
 
				             print("Zylinderform")
			
 
				             print(dim)
			
 
				-        if "r" in dim:
			
 
				-            print("Konzentrizität?")
			
 
				+            continue
			
 
				+        if re.match(r"g\s\d*\W?\d*", dim):
			
 
				+            print("Konzentrizität")
			
 
				             print(dim)
			
 
				-        if "i" in dim:
			
 
				+            continue
			
 
				+        if re.match(r"i\s\d*\W?\d*", dim):
			
 
				             print("Symmetrie")
			
 
				             print(dim)
			
 
				-        if "j" in dim:
			
 
				+            continue
			
 
				+        if re.match(r"j\s\d*\W?\d*", dim):
			
 
				             print("Ortstoleranz/Mittelpunkt")
			
 
				             print(dim)
			
 
				-        if "n" in dim:
			
 
				+        if re.match(r"n\d*", dim):
			
 
				             print("Durchmesser")
			
 
				             print(dim)
			
 
				         if "É" in dim:
			
 
				             print("Modifikator")
			
 
				             print(dim)
			
 
				+            continue
			
 
				         ####nicht dabei: neigungswinkel und lauftoleranzen
			
 
				-        if "R" in dim:
			
 
				+        if re.match(r"R\d*$",dim):
			
 
				             print("Radius")
			
 
				             print(dim)
			
 
				+            continue
			
 
				         if "°" in dim:
			
 
				             print("Grad")
			
 
				-        if "Ø" in dim:
			
 
				+            print(dim)
			
 
				+            continue
			
 
				+        if re.match(r"Ø\s*\d*\W?\d*", dim):
			
 
				             print("Durchmesser")
			
 
				+            print(dim)
			
 
				+            continue
			
 
				+
			
 
				+def merge(dims):
			
 
				+    last_item = ""
			
 
				+    i = 0
			
 
				+    new_dims = []
			
 
				+    for dim in dims:
			
 
				+        dims[i] = dim.replace('È','GG')
			
 
				+        if re.match(r"\d?x$", last_item):
			
 
				+            last_item = last_item + " " + dims[i]
			
 
				+        if re.match(r"R0", dim):
			
 
				+            last_item = dim + last_item
			
 
				+        if re.match(r"^°$", last_item):
			
 
				+            last_item = dim + last_item
			
 
				+        new_dims.append(last_item)
			
 
				+        i += 1
			
 
				+        last_item = dim
			
 
				+    return dims
			
 
				+
			
 
				 
			
 
				 regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
			
 
				 regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
			
@@ -69,19 +103,28 @@ regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
 
				 regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)" #get iso standards
			
 
				 reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
			
 
				 reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
			
 
				-reg_all = r"(^(?!0)\d{1}$)|(^[A-Z]{1}-?[A-Z]?$)|(^[A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,})"
			
 
				+reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
			
 
				 extracted_dimensions = []
			
 
				-file = open('/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.txt', 'r')
			
 
				+
			
 
				+
			
 
				+text = csv_to_pandadf.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
			
 
				+
			
 
				+"""file = open('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged.csv', 'r')
			
 
				 text = file.read()
			
 
				 file.close()
			
 
				-matches = re.findall(regex, text, re.MULTILINE)
			
 
				-for match in matches:
			
 
				+matches = re.findall(regex, text, re.MULTILINE) """
			
 
				+for match in text:
			
 
				     extracted_dimensions.append(match.strip())
			
 
				 #print(extracted_dimensions)
			
 
				+"""for dim in extracted_dimensions:
			
 
				+    print( [dim] )"""
			
 
				 isos = []
			
 
				 isos, dims = clean(extracted_dimensions)
			
 
				-#print(isos)
			
 
				-#dims = clean(dims)
			
 
				 for dim in dims:
			
 
				-    print(dim)
			
 
				-print_clean(dims)
			
 
				+   print(dim)
			
 
				+#print(isos)
			
 
				+new_dims = []
			
 
				+new_dims = merge(dims)
			
 
				+print(new_dims)
			
 
				+
			
 
				+#print_clean(dims)
			
--- a/regex_online_tester.py
+++ b/regex_online_tester.py
@@ -1,116 +0,0 @@
 
				-# coding=utf8
			
 
				-# the above tag defines encoding for this document and is for Python 2.x compatibility
			
 
				-
			
 
				-import re
			
 
				-
			
 
				-regex = r"(^(?!0)\d{1}$)|(^[A-Z]{1}-?[A-Z]?$)|(^[A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,})"
			
 
				-
			
 
				-test_str = ("2"
			
 
				-            "4"
			
 
				-            "6"
			
 
				-            "8"
			
 
				-            "Z ( 20 : 1 )"
			
 
				-            "b 0,01 A\n"
			
 
				-            "2x\n"
			
 
				-            "r Ø0,05 CZ B\n"
			
 
				-            "g 0,01\n"
			
 
				-            "R2\n"
			
 
				-            "5°\n"
			
 
				-            ",8\n"
			
 
				-            "R0\n"
			
 
				-            "Ø 4,1\n"
			
 
				-            "B\n"
			
 
				-            "-0,1\n"
			
 
				-            "2x Ø19,2 +- 00,05 CT\n"
			
 
				-            "-0,05\n"
			
 
				-            "Ø15,15 +- 0,05\n"
			
 
				-            "1,65\n"
			
 
				-            "-0,1\n"
			
 
				-            "0\n"
			
 
				-            "n11\n"
			
 
				-            ",5\n"
			
 
				-            "R0,\n"
			
 
				-            "R0\n"
			
 
				-            "C\n"
			
 
				-            "C\n"
			
 
				-            "Rz 1,6\n"
			
 
				-            "Rpk 0,2\n"
			
 
				-            "°\n"
			
 
				-            "45\n"
			
 
				-            "E #F\n"
			
 
				-            "Ø4,5 x7 (++ 0,028) È\n"
			
 
				-            "f 0,01 A\n"
			
 
				-            "0,7\n"
			
 
				-            "b 0,01 A\n"
			
 
				-            "c 0,01\n"
			
 
				-            "r Ø0,05 B\n"
			
 
				-            "0,04\n"
			
 
				-            "F\n"
			
 
				-            "1,5 +- 0,1\n"
			
 
				-            "D\n"
			
 
				-            "0\n"
			
 
				-            "dimensions apply to the finished part\n"
			
 
				-            "2 +- 0,1\n"
			
 
				-            "Material: 1.4021\n"
			
 
				-            "c 0,005\n"
			
 
				-            "9,3 +- 0,1\n"
			
 
				-            "Ra 0,8\n"
			
 
				-            "4,8 `0,05\n"
			
 
				-            "0\n"
			
 
				-            "Raw part No:\n"
			
 
				-            "-\n"
			
 
				-            "Rz 0,25\n"
			
 
				-            "Edge finish:\n"
			
 
				-            "-0,2\n"
			
 
				-            "Rz 16\n"
			
 
				-            "-0,05\n"
			
 
				-            "for machined surface\n"
			
 
				-            "B\n"
			
 
				-            "principales and rules according to:\n"
			
 
				-            "1   Start drawing\n"
			
 
				-            "-\n"
			
 
				-            "of form, orientation, location and run-out:\n"
			
 
				-            "DESCRIPTION\n"
			
 
				-            "DRAWN BY INSP. DATE\n"
			
 
				-            "Edge finish according to:\n"
			
 
				-            "REVISION HISTORY\n"
			
 
				-            "Copyright\n"
			
 
				-            "    16%\n"
			
 
				-            "not applicable\n"
			
 
				-            "this document as well as the\n"
			
 
				-            "without\n"
			
 
				-            "    Nominal\n"
			
 
				-            "file:\n"
			
 
				-            "be held liable for the payment\n"
			
 
				-            "the event\n"
			
 
				-            "A\n"
			
 
				-            "A\n"
			
 
				-            "of the grant of a patent,\n"
			
 
				-            "Scale:\n"
			
 
				-            "Projection:\n"
			
 
				-            "Valid for all untoleranced dimensions:\n"
			
 
				-            "d 0,2 A B\n"
			
 
				-            "-\n"
			
 
				-            "2:1\n"
			
 
				-            "mm\n"
			
 
				-            "Description:\n"
			
 
				-            "F   Cleanliness:\n"
			
 
				-            "11.07.2018\n"
			
 
				-            "F\n"
			
 
				-            "    Part\n"
			
 
				-            "of deposits, detachable\n"
			
 
				-            "Design Freeze Release B1 -Sample"
			
 
				-            "Page:\n"
			
 
				-            "    Delivery\n"
			
 
				-            "tbd\n"
			
 
				-            "ENGINE Division"
			
 
				-            "5129275\n"
			
 
				-            "1/1\n"
			
 
				-            "2\n"
			
 
				-            "4\n"
			
 
				-            "6\n"
			
 
				-            "A3\n")
			
 
				-
			
 
				-matches = re.sub(regex,"" ,test_str)
			
 
				-
			
 
				-print(test_str)
			
--- a/regex_test.py
+++ b/regex_test.py
@@ -0,0 +1,27 @@
 
				+import re
			
 
				+reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
			
 
				+regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)"
			
 
				+file = open('/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.txt', 'r')
			
 
				+text = file.read()
			
 
				+file.close()
			
 
				+extraction = []
			
 
				+matches = re.findall(regex, text, re.MULTILINE)
			
 
				+for match in matches:
			
 
				+    #print(match)
			
 
				+    extraction.append(match.strip())
			
 
				+i = 0
			
 
				+new_matches = []
			
 
				+for match in extraction:
			
 
				+    #print(match)
			
 
				+    #print("blub")
			
 
				+    match = match.split('\n')[0]
			
 
				+    #if len(match)>1:
			
 
				+    #    extraction.append(match[1])
			
 
				+        #print(match[1])
			
 
				+    #print([match])
			
 
				+    if not re.search(reg_all, match):
			
 
				+        #print("blub")
			
 
				+        new_matches.append(match)
			
 
				+    i += 1
			
 
				+
			
 
				+print(new_matches)
			
--- a/5129275_extracted.csv
+++ b/5129275_extracted.csv
@@ -1,3 +1,4 @@
 
				+Text;X;Y
			
 
				 CELLSTYLEMAP;210;148
			
 
				 "\fArial|b1|i0;\H2.0000;Angelehnt an HN810";-134;50
			
 
				 "\fArial|b1|i0;\H2.0000;Losgröße (Stk.)";-123;45
			
--- a/temporary/5152166_extracted.csv
+++ b/temporary/5152166_extracted.csv
--- a/temporary/file_out.csv
+++ b/temporary/file_out.csv
--- a/temporary/merged_values.csv
+++ b/temporary/merged_values.csv
--- a/temporary/text_merged.csv
+++ b/temporary/text_merged.csv
--- a/temporary/text_merged_GV12.csv
+++ b/temporary/text_merged_GV12.csv
--- a/temporary/values.csv
+++ b/temporary/values.csv
--- a/temporary/values_clustered.csv
+++ b/temporary/values_clustered.csv
--- a/temporary/values_clustered_GV12.csv
+++ b/temporary/values_clustered_GV12.csv