@@ -3,13 +3,14 @@ from tika import parser
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import nltk
+nltk.download('stopwords')
 #write a for-loop to open many files -- leave a comment if you'd
 #like to learn how
-filename = "GV_12.pdf"
+filename = "../drawings/GV_12.PDF"
 #open allows you to read the file
 pdfFileObj = open(filename,'rb')
 #The pdfReader variable is a readable object that will be parsed
 pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
 #discerning the number of pages will allow us to parse through all
 #the pages
 num_pages = pdfReader.numPages
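The comment in the hunk above invites a for-loop over many files. A minimal sketch of that loop, assuming a glob pattern over the revision's drawings folder (the batch structure is an assumption, not part of this revision):

    import glob
    import PyPDF2

    # hypothetical batch version: open every PDF in the assumed drawings folder
    for filename in glob.glob("../drawings/*.PDF"):
        pdfFileObj = open(filename, 'rb')             # read each file in binary mode
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # same readable object as above
        num_pages = pdfReader.numPages                # page count for the per-page parse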
@@ -25,11 +26,11 @@ if text != "":
     text = text
 #If the above returns as False, we run the OCR library textract to
 #convert scanned/image based PDF files into text
 else:
-    raw = parser.from_file("GV_12.pdf")
+    raw = parser.from_file("../drawings/GV_12.PDF")
     raw = str(raw)
     safe_text = raw.encode('utf-8', errors='ignore')
     text = str(safe_text).replace("\n", "").replace("\\", "")
-    print(text)
+    print(raw)
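Note that, despite the comment above, the fallback in this revision calls tika's parser rather than textract. parser.from_file returns a dict whose 'content' key holds the extracted text, so str(raw) stringifies the metadata along with it; a tighter sketch of the same fallback (not the revision's code) would be:

    raw = parser.from_file("../drawings/GV_12.PDF")
    text = raw['content'] or ""  # 'content' is None when tika extracts nothing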
 #The word_tokenize() function will break our text phrases into
 #individual words
 tokens = word_tokenize(text)
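The stopwords import and the nltk.download('stopwords') call in the first hunk set up a filtering step this section stops short of; word_tokenize itself also needs the 'punkt' models (nltk.download('punkt')), which this revision does not fetch. A minimal sketch of the filtering, continuing from tokens above (the keywords name is assumed, not in the original):

    # keep alphabetic tokens that are not common English stopwords
    stop_words = set(stopwords.words('english'))
    keywords = [w for w in tokens if w.isalpha() and w.lower() not in stop_words]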