
test commit

bscheibel 4 years ago
parent · commit df424fc1c8

+ 2 - 5
clustering_precomputed_dbscan.py

@@ -113,15 +113,12 @@ def dist(rectangle1, rectangle2):
     return distance
 
 def clustering(dm,eps):
-    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)  ## 3.93 until now; at 5 more are already detected, at 7 even more but already too many; 4.5 is good for GV12, too low for LH
-    #db = OPTICS(min_samples=1,xi=0.1, metric="precomputed").fit(dm)
+    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)                                                                                        ## 3.93 until now; at 5 more are already detected, at 7 even more but already too many; 4.5 is good for GV12, too low for LH
     labels = db.labels_
-    # Number of clusters in labels
     n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
 
     print('Estimated number of clusters: %d' % n_clusters_)
-    data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/list_to_csv_with_corner_points.csv",
-                           sep=";")
+    data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/list_to_csv_with_corner_points.csv", sep=";")
     data_df["cluster"] = labels
     data_df.groupby(['cluster', 'ausrichtung'])['element'].apply(','.join).reset_index().to_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/values_clusteredfrom_precomputed_dbscan.csv",sep=";", header=False, index=False)
     return data_df
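
For context on the call above: with metric="precomputed", DBSCAN expects dm to be a square distance matrix and compares eps directly against its entries, and min_samples=1 means no point is ever labelled as noise. A minimal sketch with an illustrative toy matrix (values are not from this repository):

import numpy as np
from sklearn.cluster import DBSCAN

# Toy symmetric distance matrix for four elements (illustrative values only).
dm = np.array([[0.0, 0.5, 6.0, 6.2],
               [0.5, 0.0, 5.8, 6.1],
               [6.0, 5.8, 0.0, 0.4],
               [6.2, 6.1, 0.4, 0.0]])

# eps is compared against the precomputed distances; min_samples=1 makes every
# point a core point, so there is no -1 noise label in labels_.
db = DBSCAN(eps=1.0, min_samples=1, metric="precomputed").fit(dm)
print(db.labels_)  # expected: [0 0 1 1]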

+ 5 - 0
iso_documents/ISO2768-1.txt

@@ -228,6 +228,11 @@ BEST BeuthStandardsCollection - Stand 2016-11
                                                         c                 grob                                       ± 1°              ± 0° 30'           ±0° 15'              ±0° 10'
 
                                                         V              sehr grob                                     ±2°               ±1°                                     ±0° 20'
+
+
+
+
+
                                                 Seite 4 DIN ISO 2768 Teil 1
 
                                                 5 Zeichnungseintragungen                                      6 Zurückweisung

File diff suppressed because it is too large
+ 1078 - 762
iso_documents/ISO8015.txt


+ 1 - 1
main.py

@@ -21,7 +21,7 @@ def main(uuid, filepath, db, eps):
         if number_words > 500:
             eps = 7
         else:
-            eps = 0.001
+            eps = 1
     #print(eps)
     isos = order_bounding_boxes_in_each_block.extract_isos(result)
     res = clustering_precomputed_dbscan.cluster_and_preprocess(result,eps)

+ 12 - 12
old/dxf_line_reader.py

@@ -9,25 +9,25 @@ def printpoint(b):
         pass
 
 buffer = ['0', 'fake']
-filepath = 'GV_12.DXF'
-with open(filepath,'r', errors="replace") as fp:
+filepath = '../drawings/GV_12.DXF'
+with open(filepath,'r') as fp:
     line = fp.readline()
     cnt = 1
-    #while line:
-        #line = fp.readline()
+    while line:
+        line = fp.readline()
     #line = line.rstrip()
-    print(line)
-    if line == '0':  # we've started a new section, so
-        print("Line {}: {}".format(cnt, line.strip()))
-            #try:
-            #    printpoint(buffer)  # handle the captured section
-            #except:
-            #    print("ERROR")
+        print(line)
+        if line == '0':  # we've started a new section, so
+            print("Line {}: {}".format(cnt, line.strip()))
+            try:
+                printpoint(buffer)  # handle the captured section
+            except:
+                print("ERROR")
 
     #buffer = []  # and start a new one
     #buffer.append(line)
     cnt += 1
-#f.close()
+fp.close()
 
 #printpoint(buffer)        # buffer left over from last pass through loop
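
A side note on the loop fixed above: since the file is opened in a with-block, the added fp.close() at the end is redundant, and iterating the file object directly avoids the manual readline() bookkeeping. A minimal sketch of that variant (same file path as above, keeping errors="replace" from the earlier version):

filepath = '../drawings/GV_12.DXF'
with open(filepath, 'r', errors="replace") as fp:
    # Iterating the file object yields one line per pass; the with-block closes the file.
    for cnt, line in enumerate(fp, start=1):
        if line.strip() == '0':  # DXF group code 0 starts a new entity/section
            print("Line {}: {}".format(cnt, line.strip()))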
 

+ 3 - 3
old/dxf_reader.py

@@ -11,11 +11,11 @@ def printpoint(b):
         print('{}'.format(obj['0']))
 
 
-print('Code','Text')# header line
+#print('Code','Text')# header line
 buffer = ['0', 'fake']    # give first pass through loop something to process
-for line in fileinput.input("GV_12.DXF", errors="replace"):
+for line in fileinput.input("../drawings/GV_12.DXF"):
     line = line.rstrip()
-    print(line)
+    #print(line)
     if line == '0':         # we've started a new section, so
         printpoint(buffer)      # handle the captured section
         buffer = []             # and start a new one

+ 5 - 4
old/ocr_test.py

@@ -3,13 +3,14 @@ from tika import parser
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import nltk
+nltk.download('stopwords')
 
 
 #write a for-loop to open many files -- leave a comment if you'd #like to learn how
-filename = "GV_12.pdf"
+filename = "../drawings/GV_12.PDF"
 #open allows you to read the file
 pdfFileObj = open(filename,'rb')
-#The pdfReader variable is a readable object that will be parsed
+#The pdfReader variable is a reada2ble object that will be parsed
 pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
 #discerning the number of pages will allow us to parse through all #the pages
 num_pages = pdfReader.numPages
@@ -25,11 +26,11 @@ if text != "":
    text = text
 #If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
 else:
-    raw = parser.from_file("GV_12.pdf")
+    raw = parser.from_file("../drawings/GV_12.PDF")
     raw = str(raw)
     safe_text = raw.encode('utf-8', errors='ignore')
     text = str(safe_text).replace("\n", "").replace("\\", "")
-    print(text)
+    print(raw)
 
 #The word_tokenize() function will break our text phrases into #individual words
 tokens = word_tokenize(text)
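
For orientation, the flow in this script is: read the embedded text layer with PyPDF2, and fall back to tika if that comes back empty. Roughly, using only the calls already present in the file (a sketch, not a drop-in replacement):

import PyPDF2
from tika import parser

filename = "../drawings/GV_12.PDF"
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Concatenate the embedded text layer of every page.
    text = "".join(pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages))

if not text.strip():
    # No embedded text: fall back to tika's parsed output for scanned PDFs.
    raw = parser.from_file(filename)
    text = str(raw.get('content') or '')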

+ 2 - 1
old/read_pdf.py

@@ -13,4 +13,5 @@
 
 
 import textract
-text = textract.process("GV_12.pdf")
+text = textract.process("../drawings/GV_12.PDF")
+print(text)

+ 6 - 14
old/read_text_lines_from_dxf.py

@@ -2,15 +2,10 @@ import csv
 import math
 
 def printsection(buffer, file_out):
-    #print(b)
     obj = dict(zip(buffer[::2], buffer[1::2]))
     for keys, values in obj.items():
         if keys == '1':
             try:
-                #print(values)
-                #print('{},{}'.format(obj['10'], obj['20']))
-                #print("\n")
-
                 row = [values, math.floor(float(obj['10'])),math.floor(float(obj['20']))]
                 with open(file_out, 'a') as csvFile:
                     writer = csv.writer(csvFile, delimiter =';')
@@ -18,15 +13,9 @@ def printsection(buffer, file_out):
                         writer.writerow(row)
 
                 csvFile.close()
-
-
-
             except:
                 print("ERROR")
-                #print(b)
 
-    #if obj.get('1'):
-    #    print('{}'.format(obj['1']))
 
 def read(file, file_out):
     buffer = []
@@ -34,8 +23,11 @@ def read(file, file_out):
     for line in file:
         line = line.strip()
         #print(line)
-        if line == '100':         # we've started a new section, so
-            printsection(buffer, file_out)      # handle the captured section
-            buffer = []             # and start a new one
+        if line == '100':
+            printsection(buffer, file_out)
+            buffer = []
         buffer.append(line)
     printsection(buffer, file_out)
+
+
+read("../drawings/sample.DXF", "sample.csv")

+ 1 - 5
order_bounding_boxes_in_each_block.py

@@ -62,23 +62,19 @@ def pdf_to_html(uuid,filepath):
 
 def extract_isos(result):
     reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
-    #reg1 = r""
-    #reg2 = r""
     details_ = []
     for element in result:
         new_arr = ""
-        print(element)
         for x in element:
             new_arr += x[4] + " "
-        #print(new_arr)
         if re.search(reg,new_arr):
-            #print(new_arr)
             found = re.findall(reg, new_arr)
             for f in found:
                 if len(f[0]) != 0:
                     details_.append(f[0].replace(")",""))
                 if len(f[1]) != 0:
                     details_.append(f[1])
+
     return details_
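
For reference, what the ISO/EN pattern above picks up from a line of drawing text; the sample string here is invented:

import re

reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
sample = "General tolerances ISO 2768-1 and material certificate EN 10204 apply."
# findall returns one tuple per match, with the non-matching alternative empty;
# the trailing \W? also captures the space after "ISO 2768-1".
print(re.findall(reg, sample))
# expected: [('ISO 2768-1 ', ''), ('', 'EN 10204')]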
 
 

+ 1 - 1
organize_drawing_according_to_details_new.py

@@ -140,7 +140,7 @@ def main_function(result, tables):
     for table in tables:
         table[3] = 10000000
         coord = []
-        name = "Table"
+        name = "ZZZZZTable"
         for tab in table[:4]:
             coord.append(tab)
         details_dict[name] = coord

+ 3 - 2
read_iso_tables/pdf_table_extractor.py

@@ -1,7 +1,8 @@
 import camelot
 import matplotlib.pyplot as plt
-tables = camelot.read_pdf('/Users/beatescheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF', pages="3",line_scale=70, line_tol=2, joint_tol=35)
+tables = camelot.read_pdf('/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF', pages="3",line_scale=70, line_tol=2, joint_tol=35)
 tables.export('foo.csv', f='csv')
 print(tables[0].df)
 camelot.plot(tables[0], kind='grid')
-plt.show()
+plt.show()
+print(tables[3])

+ 4 - 4
read_iso_tables/read_isos.py

@@ -1,10 +1,10 @@
 import nltk
 import re
 from tika import parser
-#einleitung = False
+einleitung = False
+raw = parser.from_file('/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF')
 #raw = parser.from_file('iso_documents/ISO286-2.PDF')
-#raw = parser.from_file('iso_documents/ISO286-2.PDF')
-#print(raw['content'])
+print(raw['content'])
 #text = raw['content']
 #sent_text = nltk.sent_tokenize(text)
 #tokenized_text = nltk.word_tokenize(sent_text.split)
@@ -19,7 +19,7 @@ from tika import parser
 
 import subprocess
 #subprocess.check_output(['ls','-l']) #all that is technically needed...
-cmd = 'pdftotext -layout "home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO8015.PDF"'
+cmd = 'pdftotext -layout "/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO8015.PDF"'
 print(subprocess.Popen(cmd, shell=True))
 
 #convert iso document to text

+ 3 - 2
read_iso_tables/read_tables.py

@@ -23,10 +23,11 @@ def file_read(fname):
 
 
 #file_read('drawings/5129275_Rev01-GV12.txt')
-tables = camelot.read_pdf("iso_documents/ISO2768-1.PDF", pages="3")
+tables = camelot.read_pdf("/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF", pages="3")
 tables.export('output_mit_camelot.csv', f='csv')
 
-output = subprocess.check_output(["less","iso_documents/ISO2768-1.PDF"])
+output = subprocess.check_output(["less","/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF"])
+print(output)
 
 re_data_prefix = re.compile("^[0-9]+[.].*$")
 re_data_fields = re.compile("(([^ ]+[ ]?)+)")

File diff suppressed because it is too large
+ 126 - 330
temporary/list_to_csv_with_corner_points.csv


File diff suppressed because it is too large
+ 125 - 255
temporary/values_clusteredfrom_precomputed_dbscan.csv