
test commit

bscheibel 4 years ago
parent · commit df424fc1c8

+ 2 - 5
clustering_precomputed_dbscan.py

@@ -113,15 +113,12 @@ def dist(rectangle1, rectangle2):
     return distance
 
 def clustering(dm,eps):
-    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)  ## 3.93 until now; at 5 more are already detected, at 7 even more but already too many; 4.5 is good for GV12, too low for LH
-    #db = OPTICS(min_samples=1,xi=0.1, metric="precomputed").fit(dm)
+    db = DBSCAN(eps=eps, min_samples=1, metric="precomputed").fit(dm)                                                                                        ## 3.93 until now; at 5 more are already detected, at 7 even more but already too many; 4.5 is good for GV12, too low for LH
     labels = db.labels_
-    # Number of clusters in labels
     n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
 
     print('Estimated number of clusters: %d' % n_clusters_)
-    data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/list_to_csv_with_corner_points.csv",
-                           sep=";")
+    data_df = pandas.read_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/list_to_csv_with_corner_points.csv", sep=";")
     data_df["cluster"] = labels
     data_df.groupby(['cluster', 'ausrichtung'])['element'].apply(','.join).reset_index().to_csv("/home/bscheibel/PycharmProjects/dxf_reader/temporary/values_clusteredfrom_precomputed_dbscan.csv",sep=";", header=False, index=False)
     return data_df
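
For context on the call above: with metric="precomputed", DBSCAN expects dm to be a square distance matrix and compares eps directly against its entries, and min_samples=1 means no point is ever labelled as noise. A minimal sketch with an illustrative toy matrix (values are not from this repository):

import numpy as np
from sklearn.cluster import DBSCAN

# Toy symmetric distance matrix for four elements (illustrative values only).
dm = np.array([[0.0, 0.5, 6.0, 6.2],
               [0.5, 0.0, 5.8, 6.1],
               [6.0, 5.8, 0.0, 0.4],
               [6.2, 6.1, 0.4, 0.0]])

# eps is compared against the precomputed distances; min_samples=1 makes every
# point a core point, so there is no -1 noise label in labels_.
db = DBSCAN(eps=1.0, min_samples=1, metric="precomputed").fit(dm)
print(db.labels_)  # expected: [0 0 1 1]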

+ 5 - 0
iso_documents/ISO2768-1.txt

@@ -228,6 +228,11 @@ BEST BeuthStandardsCollection - Stand 2016-11
                                                         c                 grob                                       ± 1°              ± 0° 30'           ±0° 15'              ±0° 10'
 
                                                         V              sehr grob                                     ±2°               ±1°                                     ±0° 20'
+
+
+
+
+
                                                 Seite 4 DIN ISO 2768 Teil 1
 
                                                 5 Zeichnungseintragungen                                      6 Zurückweisung

File diff suppressed because it is too large
+ 1078 - 762
iso_documents/ISO8015.txt


+ 1 - 1
main.py

@@ -21,7 +21,7 @@ def main(uuid, filepath, db, eps):
         if number_words > 500:
             eps = 7
         else:
-            eps = 0.001
+            eps = 1
     #print(eps)
     isos = order_bounding_boxes_in_each_block.extract_isos(result)
     res = clustering_precomputed_dbscan.cluster_and_preprocess(result,eps)

+ 12 - 12
old/dxf_line_reader.py

@@ -9,25 +9,25 @@ def printpoint(b):
         pass
 
 buffer = ['0', 'fake']
-filepath = 'GV_12.DXF'
-with open(filepath,'r', errors="replace") as fp:
+filepath = '../drawings/GV_12.DXF'
+with open(filepath,'r') as fp:
     line = fp.readline()
     cnt = 1
-    #while line:
-        #line = fp.readline()
+    while line:
+        line = fp.readline()
     #line = line.rstrip()
-    print(line)
-    if line == '0':  # we've started a new section, so
-        print("Line {}: {}".format(cnt, line.strip()))
-            #try:
-            #    printpoint(buffer)  # handle the captured section
-            #except:
-            #    print("ERROR")
+        print(line)
+        if line == '0':  # we've started a new section, so
+            print("Line {}: {}".format(cnt, line.strip()))
+            try:
+                printpoint(buffer)  # handle the captured section
+            except:
+                print("ERROR")
 
     #buffer = []  # and start a new one
     #buffer.append(line)
     cnt += 1
-#f.close()
+fp.close()
 
 #printpoint(buffer)        # buffer left over from last pass through loop
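
A side note on the loop fixed above: since the file is opened in a with-block, the added fp.close() at the end is redundant, and iterating the file object directly avoids the manual readline() bookkeeping. A minimal sketch of that variant (same file path as above, keeping errors="replace" from the earlier version):

filepath = '../drawings/GV_12.DXF'
with open(filepath, 'r', errors="replace") as fp:
    # Iterating the file object yields one line per pass; the with-block closes the file.
    for cnt, line in enumerate(fp, start=1):
        if line.strip() == '0':  # DXF group code 0 starts a new entity/section
            print("Line {}: {}".format(cnt, line.strip()))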
 

+ 3 - 3
old/dxf_reader.py

@@ -11,11 +11,11 @@ def printpoint(b):
         print('{}'.format(obj['0']))
 
 
-print('Code','Text')# header line
+#print('Code','Text')# header line
 buffer = ['0', 'fake']    # give first pass through loop something to process
-for line in fileinput.input("GV_12.DXF", errors="replace"):
+for line in fileinput.input("../drawings/GV_12.DXF"):
     line = line.rstrip()
-    print(line)
+    #print(line)
     if line == '0':         # we've started a new section, so
         printpoint(buffer)      # handle the captured section
         buffer = []             # and start a new one

+ 5 - 4
old/ocr_test.py

@@ -3,13 +3,14 @@ from tika import parser
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 import nltk
+nltk.download('stopwords')
 
 
 #write a for-loop to open many files -- leave a comment if you'd #like to learn how
-filename = "GV_12.pdf"
+filename = "../drawings/GV_12.PDF"
 #open allows you to read the file
 pdfFileObj = open(filename,'rb')
-#The pdfReader variable is a readable object that will be parsed
+#The pdfReader variable is a reada2ble object that will be parsed
 pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
 #discerning the number of pages will allow us to parse through all #the pages
 num_pages = pdfReader.numPages
@@ -25,11 +26,11 @@ if text != "":
    text = text
 #If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
 else:
-    raw = parser.from_file("GV_12.pdf")
+    raw = parser.from_file("../drawings/GV_12.PDF")
     raw = str(raw)
     safe_text = raw.encode('utf-8', errors='ignore')
     text = str(safe_text).replace("\n", "").replace("\\", "")
-    print(text)
+    print(raw)
 
 #The word_tokenize() function will break our text phrases into #individual words
 tokens = word_tokenize(text)
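
For orientation, the flow in this script is: read the embedded text layer with PyPDF2, and fall back to tika if that comes back empty. Roughly, using only the calls already present in the file (a sketch, not a drop-in replacement):

import PyPDF2
from tika import parser

filename = "../drawings/GV_12.PDF"
with open(filename, 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # Concatenate the embedded text layer of every page.
    text = "".join(pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages))

if not text.strip():
    # No embedded text: fall back to tika's parsed output for scanned PDFs.
    raw = parser.from_file(filename)
    text = str(raw.get('content') or '')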

+ 2 - 1
old/read_pdf.py

@@ -13,4 +13,5 @@
 
 
 import textract
-text = textract.process("GV_12.pdf")
+text = textract.process("../drawings/GV_12.PDF")
+print(text)

+ 6 - 14
old/read_text_lines_from_dxf.py

@@ -2,15 +2,10 @@ import csv
 import math
 
 def printsection(buffer, file_out):
-    #print(b)
     obj = dict(zip(buffer[::2], buffer[1::2]))
     for keys, values in obj.items():
         if keys == '1':
             try:
-                #print(values)
-                #print('{},{}'.format(obj['10'], obj['20']))
-                #print("\n")
-
                 row = [values, math.floor(float(obj['10'])),math.floor(float(obj['20']))]
                 with open(file_out, 'a') as csvFile:
                     writer = csv.writer(csvFile, delimiter =';')
@@ -18,15 +13,9 @@ def printsection(buffer, file_out):
                         writer.writerow(row)
 
                 csvFile.close()
-
-
-
             except:
                 print("ERROR")
-                #print(b)
 
-    #if obj.get('1'):
-    #    print('{}'.format(obj['1']))
 
 def read(file, file_out):
     buffer = []
@@ -34,8 +23,11 @@ def read(file, file_out):
     for line in file:
         line = line.strip()
         #print(line)
-        if line == '100':         # we've started a new section, so
-            printsection(buffer, file_out)      # handle the captured section
-            buffer = []             # and start a new one
+        if line == '100':
+            printsection(buffer, file_out)
+            buffer = []
         buffer.append(line)
     printsection(buffer, file_out)
+
+
+read("../drawings/sample.DXF", "sample.csv")

+ 1 - 5
order_bounding_boxes_in_each_block.py

@@ -62,23 +62,19 @@ def pdf_to_html(uuid,filepath):
 
 def extract_isos(result):
     reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
-    #reg1 = r""
-    #reg2 = r""
     details_ = []
     for element in result:
         new_arr = ""
-        print(element)
         for x in element:
             new_arr += x[4] + " "
-        #print(new_arr)
         if re.search(reg,new_arr):
-            #print(new_arr)
             found = re.findall(reg, new_arr)
             for f in found:
                 if len(f[0]) != 0:
                     details_.append(f[0].replace(")",""))
                 if len(f[1]) != 0:
                     details_.append(f[1])
+
     return details_
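
For reference, what the ISO/EN pattern above picks up from a line of drawing text; the sample string here is invented:

import re

reg = r"(ISO\s\d\d\d\d*\W?\d?\W?\d?)|(EN\s\d*)"
sample = "General tolerances ISO 2768-1 and material certificate EN 10204 apply."
# findall returns one tuple per match, with the non-matching alternative empty;
# the trailing \W? also captures the space after "ISO 2768-1".
print(re.findall(reg, sample))
# expected: [('ISO 2768-1 ', ''), ('', 'EN 10204')]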
 
 

+ 1 - 1
organize_drawing_according_to_details_new.py

@@ -140,7 +140,7 @@ def main_function(result, tables):
     for table in tables:
         table[3] = 10000000
         coord = []
-        name = "Table"
+        name = "ZZZZZTable"
         for tab in table[:4]:
             coord.append(tab)
         details_dict[name] = coord

+ 3 - 2
read_iso_tables/pdf_table_extractor.py

@@ -1,7 +1,8 @@
 import camelot
 import matplotlib.pyplot as plt
-tables = camelot.read_pdf('/Users/beatescheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF', pages="3",line_scale=70, line_tol=2, joint_tol=35)
+tables = camelot.read_pdf('/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF', pages="3",line_scale=70, line_tol=2, joint_tol=35)
 tables.export('foo.csv', f='csv')
 print(tables[0].df)
 camelot.plot(tables[0], kind='grid')
-plt.show()
+plt.show()
+print(tables[3])

+ 4 - 4
read_iso_tables/read_isos.py

@@ -1,10 +1,10 @@
 import nltk
 import re
 from tika import parser
-#einleitung = False
+einleitung = False
+raw = parser.from_file('/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF')
 #raw = parser.from_file('iso_documents/ISO286-2.PDF')
-#raw = parser.from_file('iso_documents/ISO286-2.PDF')
-#print(raw['content'])
+print(raw['content'])
 #text = raw['content']
 #sent_text = nltk.sent_tokenize(text)
 #tokenized_text = nltk.word_tokenize(sent_text.split)
@@ -19,7 +19,7 @@ from tika import parser
 
 import subprocess
 #subprocess.check_output(['ls','-l']) #all that is technically needed...
-cmd = 'pdftotext -layout "home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO8015.PDF"'
+cmd = 'pdftotext -layout "/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO8015.PDF"'
 print(subprocess.Popen(cmd, shell=True))
 
 #convert iso document to text

+ 3 - 2
read_iso_tables/read_tables.py

@@ -23,10 +23,11 @@ def file_read(fname):
 
 
 #file_read('drawings/5129275_Rev01-GV12.txt')
-tables = camelot.read_pdf("iso_documents/ISO2768-1.PDF", pages="3")
+tables = camelot.read_pdf("/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF", pages="3")
 tables.export('output_mit_camelot.csv', f='csv')
 
-output = subprocess.check_output(["less","iso_documents/ISO2768-1.PDF"])
+output = subprocess.check_output(["less","/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF"])
+print(output)
 
 re_data_prefix = re.compile("^[0-9]+[.].*$")
 re_data_fields = re.compile("(([^ ]+[ ]?)+)")

File diff suppressed because it is too large
+ 126 - 330
temporary/list_to_csv_with_corner_points.csv


File diff suppressed because it is too large
+ 125 - 255
temporary/values_clusteredfrom_precomputed_dbscan.csv