bscheibel 4 yıl önce
ebeveyn
işleme
69fda4807a

cluster_test1.py → cluster_by_distance_csv_only.py


+ 18 - 14
csv_to_pandadf.py

@@ -7,19 +7,23 @@ import re
 #data = data_df[["X1","Y1","X2","Y2"]]
 #print(data)
 
-text = []
-with open('text_merged.csv', 'r') as csvFile:
-    reader = csv.reader(csvFile, delimiter=",")
-    for row in reader:
-        text.append(row[2])
-csvFile.close()
-###extract ISOs
-matches = []
-regex = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)"
-for line in text:
-    match = re.findall(regex, line)
-    if match:
-        matches.append(match)
 
-print(matches)
+def read_csv(file):
+    text = []
+    with open(file, 'r') as csvFile:
+        reader = csv.reader(csvFile, delimiter=",")
+        for row in reader:
+            text.append(row[2])
+    csvFile.close()
+    ###extract ISOs
+    matches = []
+    regex = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)"
+    for line in text:
+        match = re.findall(regex, line)
+        if match:
+            matches.append(match)
 
+    print(matches)
+
+
+    return text

+ 5 - 5
dbscan_clustering.py

@@ -10,15 +10,15 @@ from sklearn.preprocessing import StandardScaler
 
 
 # #############################################################################
-data_df = pandas.read_csv("values.csv", sep=",")
+data_df = pandas.read_csv("temporary/5129275_extracted.csv", sep=";")
 data_df.head(3)
-data = data_df[["X1","Y1","X2","Y2"]]
+data = data_df[["X","Y"]]
 print(data)
 data = StandardScaler().fit_transform(data)
 
 # #############################################################################
 # Compute DBSCAN
-db = DBSCAN(eps=0.15, min_samples=1).fit(data)
+db = DBSCAN(eps=0.2, min_samples=1).fit(data)
 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
 core_samples_mask[db.core_sample_indices_] = True
 labels = db.labels_
@@ -61,5 +61,5 @@ plt.title('Estimated number of clusters: %d' % n_clusters_)
 plt.show()
 
 print(data_df.head(3))
-data_df.to_csv("values_clustered.csv")
-data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("text_merged.csv")
+data_df.to_csv("values_clustered_GV12.csv")
+data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("text_merged_GV12.csv")

+ 0 - 30
find_groups.py

@@ -1,30 +0,0 @@
-from itertools import combinations, product
-def canMerge (g, h):
-    for i, j in g:
-        for x, y in h:
-            if abs(i - x) <= 1 and abs(j - y) <= 1:
-                return True
-    return False
-
-def findGroups (field):
-    # initialize one-element groups
-    groups = [[(i, j)] for i, j in product(range(len(field)), range(len(field[0]))) if field[i][j] != '  ']
-
-    # keep joining until no more joins can be executed
-    merged = True
-    while merged:
-        merged = False
-        for g, h in combinations(groups, 2):
-            if canMerge(g, h):
-                g.extend(h)
-                groups.remove(h)
-                merged = True
-                break
-
-    return groups
-
-# initialize field
-field = "drawings/5129275_Rev01-GV12.txt"
-groups = findGroups(field)
-
-print((groups)) # 3

+ 2 - 2
main.py

@@ -1,10 +1,10 @@
 import read_data
-import read_text_lines
+import read_text_lines_from_dxf
 #import merge_pandas
 
 
 
 file = "drawings/5152166.dxf"
 file_out = "5152166_extracted.csv"
-read_text_lines.read(file, file_out)
+read_text_lines_from_dxf.read(file, file_out)
 read_data.read_dimensions(file_out, 0)

blob_dedection.py → old/blob_dedection.py


BIN
output_image.png


read_text_lines.py → read_text_lines_from_dxf.py


+ 76 - 33
regex_extraction.py

@@ -1,67 +1,101 @@
 # coding=utf8
 import re
+import csv_to_pandadf
 
 def clean(extracted_dimensions):
-    #next part extracts the isos and removes everything we dont need like just text or the X:X stuff, einzelne buchstaben und zahlen
+    # Next part extracts the ISO standards and removes everything we don't need, such as plain text or detail/scale labels, single letters and single digits
     for dim in extracted_dimensions:
         if re.match(regex_isos, dim): #isos
             match = re.findall(regex_isos,dim)
             isos.append(match[0])
             extracted_dimensions.remove(dim)
 
-
-    for dim in extracted_dimensions:
-        match =re.match(reg_all, dim)
-        if match:
-            #print(re.findall(reg_all,dim))
-            #print(match[0])
-            try:
-                extracted_dimensions.remove(dim)
-            except:
-                print("error")
+    i = 0
+    new_matches = []
+    for match in extracted_dimensions:
+        # print(match)
+        match = match.split('\n')[0]
+        # if len(match)>1:
+        #    extraction.append(match[1])
+        # print(match[1])
+        if not re.search(reg_all, match):
+            new_matches.append(match)
+        i += 1
 
     #print(isos)
     #print(extracted_dimensions)
-    return isos, extracted_dimensions
+    return isos, new_matches
 
 
-def print_clean(extracted_dimensions):
-    for dim in extracted_dimensions:
-        if "b" in dim:
+def print_clean(dims):
+    mal = "no"
+    vorzeichen = "no"
+    for dim in dims:
+        if re.match(r"b\s\d*\W?\d*\s.",dim):
             print("Rechtwinkligkeit")
             print(dim)
-        if "g" in dim:
+            continue
+        if re.match(r"g\s\d*\W?\d*", dim):
             print("Zylinderform")
             print(dim)
-        if "f" in dim:
+            continue
+        if re.match(r"g\s\d*\W?\d*", dim):
             print("Parallelität")
             print(dim)
-        if "c" in dim:
+            continue
+        if re.match(r"g\s\d*\W?\d*", dim):
             print("Zylinderform")
             print(dim)
-        if "r" in dim:
-            print("Konzentrizität?")
+            continue
+        if re.match(r"g\s\d*\W?\d*", dim):
+            print("Konzentrizität")
             print(dim)
-        if "i" in dim:
+            continue
+        if re.match(r"i\s\d*\W?\d*", dim):
             print("Symmetrie")
             print(dim)
-        if "j" in dim:
+            continue
+        if re.match(r"j\s\d*\W?\d*", dim):
             print("Ortstoleranz/Mittelpunkt")
             print(dim)
-        if "n" in dim:
+        if re.match(r"n\d*", dim):
             print("Durchmesser")
             print(dim)
         if "É" in dim:
             print("Modifikator")
             print(dim)
+            continue
         #### not covered: angularity (Neigungswinkel) and run-out tolerances (Lauftoleranzen)
-        if "R" in dim:
+        if re.match(r"R\d*$",dim):
             print("Radius")
             print(dim)
+            continue
         if "°" in dim:
             print("Grad")
-        if "Ø" in dim:
+            print(dim)
+            continue
+        if re.match(r"Ø\s*\d*\W?\d*", dim):
             print("Durchmesser")
+            print(dim)
+            continue
+
+def merge(dims):
+    last_item = ""
+    i = 0
+    new_dims = []
+    for dim in dims:
+        dims[i] = dim.replace('È','GG')
+        if re.match(r"\d?x$", last_item):
+            last_item = last_item + " " + dims[i]
+        if re.match(r"R0", dim):
+            last_item = dim + last_item
+        if re.match(r"^°$", last_item):
+            last_item = dim + last_item
+        new_dims.append(last_item)
+        i += 1
+        last_item = dim
+    return dims
+
 
 regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
 regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
@@ -69,19 +103,28 @@ regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
 regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)" #get iso standards
 reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
 reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
-reg_all = r"(^(?!0)\d{1}$)|(^[A-Z]{1}-?[A-Z]?$)|(^[A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,})"
+reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
 extracted_dimensions = []
-file = open('/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.txt', 'r')
+
+
+text = csv_to_pandadf.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
+
+"""file = open('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged.csv', 'r')
 text = file.read()
 file.close()
-matches = re.findall(regex, text, re.MULTILINE)
-for match in matches:
+matches = re.findall(regex, text, re.MULTILINE) """
+for match in text:
     extracted_dimensions.append(match.strip())
 #print(extracted_dimensions)
+"""for dim in extracted_dimensions:
+    print( [dim] )"""
 isos = []
 isos, dims = clean(extracted_dimensions)
-#print(isos)
-#dims = clean(dims)
 for dim in dims:
-    print(dim)
-print_clean(dims)
+   print(dim)
+#print(isos)
+new_dims = []
+new_dims = merge(dims)
+print(new_dims)
+
+#print_clean(dims)

+ 0 - 116
regex_online_tester.py

@@ -1,116 +0,0 @@
-# coding=utf8
-# the above tag defines encoding for this document and is for Python 2.x compatibility
-
-import re
-
-regex = r"(^(?!0)\d{1}$)|(^[A-Z]{1}-?[A-Z]?$)|(^[A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,})"
-
-test_str = ("2"
-            "4"
-            "6"
-            "8"
-            "Z ( 20 : 1 )"
-            "b 0,01 A\n"
-            "2x\n"
-            "r Ø0,05 CZ B\n"
-            "g 0,01\n"
-            "R2\n"
-            "5°\n"
-            ",8\n"
-            "R0\n"
-            "Ø 4,1\n"
-            "B\n"
-            "-0,1\n"
-            "2x Ø19,2 +- 00,05 CT\n"
-            "-0,05\n"
-            "Ø15,15 +- 0,05\n"
-            "1,65\n"
-            "-0,1\n"
-            "0\n"
-            "n11\n"
-            ",5\n"
-            "R0,\n"
-            "R0\n"
-            "C\n"
-            "C\n"
-            "Rz 1,6\n"
-            "Rpk 0,2\n"
-            "°\n"
-            "45\n"
-            "E #F\n"
-            "Ø4,5 x7 (++ 0,028) È\n"
-            "f 0,01 A\n"
-            "0,7\n"
-            "b 0,01 A\n"
-            "c 0,01\n"
-            "r Ø0,05 B\n"
-            "0,04\n"
-            "F\n"
-            "1,5 +- 0,1\n"
-            "D\n"
-            "0\n"
-            "dimensions apply to the finished part\n"
-            "2 +- 0,1\n"
-            "Material: 1.4021\n"
-            "c 0,005\n"
-            "9,3 +- 0,1\n"
-            "Ra 0,8\n"
-            "4,8 `0,05\n"
-            "0\n"
-            "Raw part No:\n"
-            "-\n"
-            "Rz 0,25\n"
-            "Edge finish:\n"
-            "-0,2\n"
-            "Rz 16\n"
-            "-0,05\n"
-            "for machined surface\n"
-            "B\n"
-            "principales and rules according to:\n"
-            "1   Start drawing\n"
-            "-\n"
-            "of form, orientation, location and run-out:\n"
-            "DESCRIPTION\n"
-            "DRAWN BY INSP. DATE\n"
-            "Edge finish according to:\n"
-            "REVISION HISTORY\n"
-            "Copyright\n"
-            "    16%\n"
-            "not applicable\n"
-            "this document as well as the\n"
-            "without\n"
-            "    Nominal\n"
-            "file:\n"
-            "be held liable for the payment\n"
-            "the event\n"
-            "A\n"
-            "A\n"
-            "of the grant of a patent,\n"
-            "Scale:\n"
-            "Projection:\n"
-            "Valid for all untoleranced dimensions:\n"
-            "d 0,2 A B\n"
-            "-\n"
-            "2:1\n"
-            "mm\n"
-            "Description:\n"
-            "F   Cleanliness:\n"
-            "11.07.2018\n"
-            "F\n"
-            "    Part\n"
-            "of deposits, detachable\n"
-            "Design Freeze Release B1 -Sample"
-            "Page:\n"
-            "    Delivery\n"
-            "tbd\n"
-            "ENGINE Division"
-            "5129275\n"
-            "1/1\n"
-            "2\n"
-            "4\n"
-            "6\n"
-            "A3\n")
-
-matches = re.sub(regex,"" ,test_str)
-
-print(test_str)

+ 27 - 0
regex_test.py

@@ -0,0 +1,27 @@
+import re
+reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
+regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)"
+file = open('/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.txt', 'r')
+text = file.read()
+file.close()
+extraction = []
+matches = re.findall(regex, text, re.MULTILINE)
+for match in matches:
+    #print(match)
+    extraction.append(match.strip())
+i = 0
+new_matches = []
+for match in extraction:
+    #print(match)
+    #print("blub")
+    match = match.split('\n')[0]
+    #if len(match)>1:
+    #    extraction.append(match[1])
+        #print(match[1])
+    #print([match])
+    if not re.search(reg_all, match):
+        #print("blub")
+        new_matches.append(match)
+    i += 1
+
+print(new_matches)

+ 1 - 0
5129275_extracted.csv

@@ -1,3 +1,4 @@
+Text;X;Y
 CELLSTYLEMAP;210;148
 "\fArial|b1|i0;\H2.0000;Angelehnt an HN810";-134;50
 "\fArial|b1|i0;\H2.0000;Losgröße (Stk.)";-123;45

5152166_extracted.csv → temporary/5152166_extracted.csv


file_out.csv → temporary/file_out.csv


merged_values.csv → temporary/merged_values.csv


text_merged.csv → temporary/text_merged.csv


Dosya farkı çok büyük olduğundan ihmal edildi
+ 71 - 0
temporary/text_merged_GV12.csv


values.csv → temporary/values.csv


values_clustered.csv → temporary/values_clustered.csv


Dosya farkı çok büyük olduğundan ihmal edildi
+ 1317 - 0
temporary/values_clustered_GV12.csv