Browse Source

print plus minus

beatescheibel 5 years ago
parent
commit
848af8acb8
7 changed files with 109 additions and 115 deletions
  1. 47 46
      dbscan_clustering.py
  2. 0 0
      old/csv_to_text.py
  3. 0 0
      old/merge_pandas.py
  4. 1 1
      read_html_to_csv.py
  5. 60 68
      regex_extraction.py
  6. 1 0
      temporary/extracted_GV_12.csv
  7. 0 0
      temporary/output.csv

+ 47 - 46
dbscan_clustering.py

@@ -1,5 +1,3 @@
-print(__doc__)
-
 import numpy as np
 import pandas
 
@@ -8,58 +6,61 @@ from sklearn import metrics
 from sklearn.datasets.samples_generator import make_blobs
 from sklearn.preprocessing import StandardScaler
 
+def cluster(file_in, file_out):
+    # #############################################################################
+    data_df = pandas.read_csv("values.csv", sep=",")
+    data_df.head(3)
+    data = data_df[["X1", "Y1","X2","Y2"]]
+    print(data)
+    data = StandardScaler().fit_transform(data)
+
+    # #############################################################################
+    # Compute DBSCAN
+    db = DBSCAN(eps=0.1, min_samples=1).fit(data)
+    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
+    core_samples_mask[db.core_sample_indices_] = True
+    labels = db.labels_
+    print(data[labels == 0])
+    data_df["cluster"] = labels
 
-# #############################################################################
-data_df = pandas.read_csv("temporary/5129275_extracted.csv", sep=";")
-data_df.head(3)
-data = data_df[["X","Y"]]
-print(data)
-data = StandardScaler().fit_transform(data)
+    # Number of clusters in labels, ignoring noise if present.
+    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
+    n_noise_ = list(labels).count(-1)
 
-# #############################################################################
-# Compute DBSCAN
-db = DBSCAN(eps=0.2, min_samples=1).fit(data)
-core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
-core_samples_mask[db.core_sample_indices_] = True
-labels = db.labels_
-print(data[labels == 0])
-data_df["cluster"] = labels
+    print('Estimated number of clusters: %d' % n_clusters_)
+    print('Estimated number of noise points: %d' % n_noise_)
+    print("Silhouette Coefficient: %0.3f"
+          % metrics.silhouette_score(data, labels))
 
-# Number of clusters in labels, ignoring noise if present.
-n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
-n_noise_ = list(labels).count(-1)
+    # #############################################################################
+    # Plot result
+    import matplotlib.pyplot as plt
 
-print('Estimated number of clusters: %d' % n_clusters_)
-print('Estimated number of noise points: %d' % n_noise_)
-print("Silhouette Coefficient: %0.3f"
-      % metrics.silhouette_score(data, labels))
+    # Black removed and is used for noise instead.
+    unique_labels = set(labels)
+    colors = [plt.cm.Spectral(each)
+              for each in np.linspace(0, 1, len(unique_labels))]
+    for k, col in zip(unique_labels, colors):
+        if k == -1:
+            # Black used for noise.
+            col = [0, 0, 0, 1]
 
-# #############################################################################
-# Plot result
-import matplotlib.pyplot as plt
+        class_member_mask = (labels == k)
 
-# Black removed and is used for noise instead.
-unique_labels = set(labels)
-colors = [plt.cm.Spectral(each)
-          for each in np.linspace(0, 1, len(unique_labels))]
-for k, col in zip(unique_labels, colors):
-    if k == -1:
-        # Black used for noise.
-        col = [0, 0, 0, 1]
+        xy = data[class_member_mask & core_samples_mask]
+        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
+                 markeredgecolor='k', markersize=14)
 
-    class_member_mask = (labels == k)
+        xy = data[class_member_mask & ~core_samples_mask]
+        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
+                 markeredgecolor='k', markersize=6)
 
-    xy = data[class_member_mask & core_samples_mask]
-    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-             markeredgecolor='k', markersize=14)
+    plt.title('Estimated number of clusters: %d' % n_clusters_)
+    plt.show()
 
-    xy = data[class_member_mask & ~core_samples_mask]
-    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
-             markeredgecolor='k', markersize=6)
+    print(data_df.head(3))
+    data_df.to_csv("values_clustered.csv")
+    data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("text_merged.csv")
 
-plt.title('Estimated number of clusters: %d' % n_clusters_)
-plt.show()
 
-print(data_df.head(3))
-data_df.to_csv("values_clustered_GV12.csv")
-data_df.groupby('cluster')['Text'].apply(' '.join).reset_index().to_csv("text_merged_GV12.csv")
+cluster(33,33)

csv_to_text.py → old/csv_to_text.py


merge_pandas.py → old/merge_pandas.py


+ 1 - 1
read_html_to_csv.py

@@ -2,7 +2,7 @@ import re
 import csv
 
 
-with open("/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.html", "r") as f:
+with open("drawings/5152166_Rev04.html", "r") as f:
     with open('values.csv', 'w') as writeFile:
         for line in f.readlines():
             #print(line)

+ 60 - 68
regex_extraction.py

@@ -1,27 +1,20 @@
 # coding=utf8
 import re
-import csv_to_text
-import csv
 import pandas
 
 def clean(extracted_dimensions):
     #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
     isos = []
-    for dim in extracted_dimensions:
-        if re.match(regex_isos, dim): #isos
-            match = re.findall(regex_isos,dim)
-            print(match)
-            isos.append(match[0])
-            extracted_dimensions.remove(dim)
+    for line in extracted_dimensions:
+        matches = re.findall(regex_isos,line)
+        for match in matches:
+            isos.append(match)
+
 
     i = 0
     new_matches = []
     for match in extracted_dimensions:
-        # print(match)
         match = match.split('\n')[0]
-        # if len(match)>1:
-        #    extraction.append(match[1])
-        # print(match[1])
         if not re.search(reg_all, match):
             new_matches.append(match)
         i += 1
@@ -32,68 +25,70 @@ def clean(extracted_dimensions):
 
 
 def print_clean(dims):
+    dims_new = []
+    dimss = []
     for dim in dims:
-        if re.match(r"b\s\d*\W?\d*\s.",dim):
-            dim = dim.replace('b', '⏊')
-            continue
-        if re.match(r"g\s\d*\W?\d*", dim):
-            dim = dim.replace('g', '⌭ ')
-            continue
-        if re.match(r"f\s\d*\W?\d*", dim):
+        dim = re.split("CT",dim)
+        dimss.extend(dim)
+    #print(dimss)
+    for dim in dimss:
+        if re.search(r"b\s\d*\W?\d*\s.",dim):
+            dim = dim.replace('b', u"\u27C2")
+        if re.search(r"g\s\d*\W?\d*", dim):
+            dim = dim.replace('g', u"\u232D")
+        if re.search(r"f\s\d*\W?\d*", dim):
             dim = dim.replace('f',  u"\u2225")
-            continue
-        if re.match(r"r\s\d*\W?\d*", dim):
-            dim = dim.replace('r', '⌾')
-            continue
-        if re.match(r"i\s\d*\W?\d*", dim):
-            dim = dim.replace('i', '⌯')
-            continue
-        if re.match(r"j\s\d*\W?\d*", dim):
-            dim = dim.replace('j', '')
-            continue
-        if re.match(r"c\s+\d*", dim):
-            dim = dim.replace('c', '⏥')
-            continue
-        if re.match(r"n\s+\d*", dim):
-            dim = dim.replace('n', '⌀')
-            continue
-        if "É" in dim:
-            dim = dim.replace('É', 'GG')
-            continue
+        if re.search(r"r\s\d*\W?\d*", dim):
+            dim = dim.replace('r', u"\u25CE")
+        if re.search(r"i\s\d*\W?\d*", dim):
+            dim = dim.replace('i', u"\u232F")
+        if re.search(r"j\s\d*\W?\d*", dim):
+            dim = dim.replace('j', u"\u2316")
+        if re.search(r"d\s\d*\W?\d*", dim):
+            dim = dim.replace('d', u"\u2313")
+        if re.search(r"c\s+\d*", dim):
+            dim = dim.replace('c', u"\u23E5")
+        if re.search(r"n\s+\d*", dim):
+            dim = dim.replace('n', u"\u2300")
+        if "È" in dim:
+            dim = dim.replace('È', 'GG')
+        if "`" in dim:
+            dim = dim.replace('`', u"\u00B1")
+        if "#" in dim:
+            dim = dim.replace('#', "↔")
+        if "⌀" in dim:
+            dim = dim.replace('⌀', "Ø")
+        reg12 = re.compile(r"(\d{1,2}\.?\d{0,2})\s\+\s-\s(\d{1,2}\.?\d{0,2})\s(\d{1,2}\.?\d{0,2})")
+        g = re.search(reg12, dim)
+        if g:
+            dim = re.sub(reg12, g.group(1) + " + " + g.group(2) + " - " + g.group(3), dim)
+        dims_new.append(dim.strip())
+        dimms = []
+        i = 0
+        for dim in dims_new:
+            last_item = i - 1
+            next_item = i + 1
+            if not re.search(r"[a-zA-Z]{3,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$",dim):
+                dimms.append(dim)
+
 
         ####nicht dabei: neigungswinkel und lauftoleranzen
-    return dims
 
-def merge(dims):
-    last_item = ""
-    i = 0
-    new_dims = []
-    for dim in dims:
-        if re.match(r"\d?x$", last_item):
-            last_item = last_item + " " + dims[i]
-        if re.match(r"R0", dim):
-            last_item = dim + last_item
-        if re.match(r"^°$", last_item):
-            last_item = dim + last_item
-        new_dims.append(last_item)
-        i += 1
-        last_item = dim
-    return dims
+    return dimms
+
 
 
 regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
 regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
 regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
-regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)" #get iso standards
+regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)|(EN\s\d*)" #get iso standards
 reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
 reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
-reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
+reg_all = re.compile(r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?|(EN\s\d*)|^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d\s\d\s\d\s\d\s\d)|BY|to:?|of|or|is|in|as|be|by |\d\d\d\d\d\d\d|\d\s\/\s\d")
 extracted_dimensions = []
-
-
 #text = csv_to_text.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
 
-file = open('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged.csv', 'r')
+file = open('text_merged.csv', 'r')
 #text = file.read()
 #file.close()
 text_df = pandas.read_csv(file)
@@ -102,13 +97,10 @@ text = text_df['Text']
 #matches = re.findall(regex, text, re.MULTILINE)
 for line in text:
     extracted_dimensions.append(line.strip())
-#print(extracted_dimensions)
-#isos = []
-isos, dims = clean(extracted_dimensions)
-print(isos)
-#new_dims = []
-new_dims = merge(dims)
-print(new_dims)
 
-dims = print_clean(dims)
-print(dims)
+isos, dims = clean(extracted_dimensions)
+#print(isos)
+isos, dims = clean(dims)
+new_dims = print_clean(dims)
+for dim in new_dims:
+    print(dim)

+ 1 - 0
temporary/extracted_GV_12.csv

@@ -1,3 +1,4 @@
+Text;X;Y
 LAYOUT;420;297
 Concepts, principales and rules according to:;0;2
 Dimensions according to:;0;-1

+ 0 - 0
temporary/output.csv