bscheibel committed e65fa64cda (5 years ago)
12 changed files with 79 additions and 33410 deletions

  1. .idea/encodings.xml      + 2  - 2
  2. GV_12.DXF                + 0  - 13912
  3. GV_12.PDF                BIN
  4. Stahl_Adapterplatte.DXF  + 0  - 19262
  5. dxf_line_reader.py       + 0  - 34
  6. dxf_reader.py            + 0  - 24
  7. merge_lines.py           + 0  - 43
  8. merge_pandas.py          + 6  - 4
  9. ocr_test.py              + 0  - 42
 10. read_data.py             + 57 - 57
 11. read_pdf.py              + 0  - 16
 12. read_text_lines.py       + 14 - 14

+ 2 - 2
.idea/encodings.xml

@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="Encoding">
-    <file url="file://$PROJECT_DIR$/GV_12.DXF" charset="US-ASCII" />
-    <file url="file://$PROJECT_DIR$/Stahl_Adapterplatte.DXF" charset="windows-1252" />
+    <file url="file://$PROJECT_DIR$/drawings/GV_12.DXF" charset="US-ASCII" />
+    <file url="file://$PROJECT_DIR$/drawings/Stahl_Adapterplatte.DXF" charset="windows-1252" />
   </component>
 </project>

File diff suppressed because it is too large
+ 0 - 13912
GV_12.DXF


BIN
GV_12.PDF


File diff suppressed because it is too large
+ 0 - 19262
Stahl_Adapterplatte.DXF


+ 0 - 34
dxf_line_reader.py

@@ -1,34 +0,0 @@
-
-def printpoint(b):
-    print(b)
-    obj = dict(zip(b[::2], b[1::2]))
-    try:
-        if obj['100'] == 'AcDbMText':
-            print('{}'.format(obj['0']))
-    except:
-        pass
-
-buffer = ['0', 'fake']
-filepath = 'GV_12.DXF'
-with open(filepath,'r', errors="replace") as fp:
-    line = fp.readline()
-    cnt = 1
-    #while line:
-        #line = fp.readline()
-    #line = line.rstrip()
-    print(line)
-    if line == '0':  # we've started a new section, so
-        print("Line {}: {}".format(cnt, line.strip()))
-            #try:
-            #    printpoint(buffer)  # handle the captured section
-            #except:
-            #    print("ERROR")
-
-    #buffer = []  # and start a new one
-    #buffer.append(line)
-    cnt += 1
-#f.close()
-
-#printpoint(buffer)        # buffer left over from last pass through loop
-
-#https://leancrew.com/all-this/2016/12/dxf-data-extraction/

+ 0 - 24
dxf_reader.py

@@ -1,24 +0,0 @@
-#!/usr/bin/env python
-
-#from fileinput \
-import fileinput
-
-
-def printpoint(b):
-    print(b)
-    obj = dict(zip(b[::2], b[1::2]))
-    if obj['0'] == 'AcDbMText':
-        print('{}'.format(obj['0']))
-
-
-print('Code','Text')# header line
-buffer = ['0', 'fake']    # give first pass through loop something to process
-for line in fileinput.input("GV_12.DXF", errors="replace"):
-    line = line.rstrip()
-    print(line)
-    if line == '0':         # we've started a new section, so
-        printpoint(buffer)      # handle the captured section
-        buffer = []             # and start a new one
-    buffer.append(line)
-
-printpoint(buffer)        # buffer left over from last pass through loop
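The two deleted readers above (dxf_line_reader.py and dxf_reader.py) follow the approach from the linked leancrew post: a DXF file is plain text made of alternating group-code/value lines, group code 0 starts a new entity, and dict(zip(b[::2], b[1::2])) pairs codes with values. A minimal self-contained sketch of that idea, assuming the same AcDbMText filter on the subclass marker (group code 100) and the text value in group code 1 as dxf_line_reader.py used:

    # Sketch only: pair alternating group-code/value lines into entity dicts.
    def entities(path):
        buffer = []
        with open(path, "r", errors="replace") as fp:
            for raw in fp:
                line = raw.strip()
                if line == "0" and buffer:       # group code 0 starts a new entity
                    yield dict(zip(buffer[::2], buffer[1::2]))
                    buffer = []
                buffer.append(line)
        if buffer:                               # flush the final entity
            yield dict(zip(buffer[::2], buffer[1::2]))

    for ent in entities("GV_12.DXF"):
        if ent.get("100") == "AcDbMText":        # subclass marker for MTEXT entities
            print(ent.get("1", ""))              # group code 1 carries the text
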

+ 0 - 43
merge_lines.py

@@ -1,43 +0,0 @@
-import csv
-
-## open CSV file and rea it
-myfile  = open('text.csv', "r")
-reader = csv.reader(myfile, delimiter=";")
-## create an empty dictionary
-mydictionary = {}
-
-rownum = 0
-
-for row in reader:
-    ## check if it is the header
-    if rownum == 0:
-        pass
-    else:
-        ## split the line of CSV in elements..Use the name for the key in dictionary and the other two in a list
-        #line = row.split(";")
-        #print(row)
-        text = row[0]
-        #print(text)
-        x = row[1]
-        y = row[2]
-
-        if x in mydictionary:
-            mydictionary[text][1] += text
-            print(mydictionary[text][1] )
-        else:
-            mydictionary[text] = [x,y]
-
-    rownum += 1
-
-myfile.close()
-
-## create a new list of lists with the data from the dictionary
-newcsvfile = ["text","x","y"]
-
-for i in mydictionary:
-    newcsvfile.append(mydictionary[i])
-
-## write the new list of lists in a new CSV file
-with open("output.csv", "wb") as f:
-    writer = csv.writer(f)
-    writer.writerows(newcsvfile)

+ 6 - 4
merge_pandas.py

@@ -1,5 +1,7 @@
 import pandas
-df = pandas.read_csv('text.csv', header = 0, delimiter=";")
-df['Text'] = df.groupby(['X','Y'])['TEXT'].transform('sum')
-df.drop_duplicates()
-df.to_csv("merged.csv")
+
+def merge_lines(file_out):
+    df = pandas.read_csv(file_out, header = 0, delimiter=";")
+    df['Text'] = df.groupby(['X','Y'])['TEXT'].transform('sum')
+    df.drop_duplicates()
+    df.to_csv("file_out.csv")

+ 0 - 42
ocr_test.py

@@ -1,42 +0,0 @@
-import PyPDF2
-from tika import parser
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-import nltk
-
-
-#write a for-loop to open many files -- leave a comment if you'd #like to learn how
-filename = "GV_12.pdf"
-#open allows you to read the file
-pdfFileObj = open(filename,'rb')
-#The pdfReader variable is a readable object that will be parsed
-pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
-#discerning the number of pages will allow us to parse through all #the pages
-num_pages = pdfReader.numPages
-count = 0
-text = ""
-#The while loop will read each page
-while count < num_pages:
-    pageObj = pdfReader.getPage(count)
-    count +=1
-    text += pageObj.extractText()
-#This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned files.
-if text != "":
-   text = text
-#If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
-else:
-    raw = parser.from_file("GV_12.pdf")
-    raw = str(raw)
-    safe_text = raw.encode('utf-8', errors='ignore')
-    text = str(safe_text).replace("\n", "").replace("\\", "")
-    print(text)
-
-#The word_tokenize() function will break our text phrases into #individual words
-tokens = word_tokenize(text)
-#we'll create a new list which contains punctuation we wish to clean
-punctuations = ['(',')',';',':','[',']',',']
-#We initialize the stopwords variable which is a list of words like #"The", "I", "and", etc. that don't hold much value as keywords
-stop_words = stopwords.words('english')
-#We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN punctuations.
-keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
-print(keywords)

+ 57 - 57
read_data.py

@@ -1,66 +1,66 @@
 import csv
 import re
 
-with open('merged.csv') as csv_file:
-    csv_reader = csv.reader(csv_file, delimiter=',')
-    line_count = 0
-    durchmesser = False
-    vorzeichen = "nix"
-    text = True
-    isos = []
-    dimensions = []
-    for row in csv_reader:
-        line_count += 1
 
-        if "ISO" in row[1]:
-            isos.append(row[1])
-        if durchmesser:
-            #print("Durchmesser: " + row[1])
-            dimensions.append("Durchmesser: " + row[1])
-            durchmesser = False
-            continue
-        if row[1] == "%%c":
-            durchmesser = True
-        if row[1] == "-" or row[1] == "+":
-            vorzeichen = row[1]
-        isnumber = re.findall(r"\d*\,\d+", row[1])
-        if isnumber:
-            if vorzeichen != "nix":
-                #print(vorzeichen + isnumber[0])
-                dimensions.append(vorzeichen + isnumber[0])
-            else:
-                if row[1][0]!="?":
-                    #print(isnumber[0])
-                    dimensions.append(isnumber[0])
-            vorzeichen = "nix"
-        if row[1][0] == "?":
-            #print("+/- " + row[1][1:])
-            dimensions.append("+/- " + row[1][1:])
+def read_dimensions(file_out, num):
+    with open(file_out) as csv_file:
+        csv_reader = csv.reader(csv_file, delimiter=',')
+        line_count = 0
+        durchmesser = False
+        vorzeichen = "nix"
+        isos = []
+        dimensions = []
+        for row in csv_reader:
+            line_count += 1
 
-    print(isos)
-    #print(dimensions)
-    print(f'Processed {line_count} lines.')
+            if "ISO" in row[num]:
+                isos.append(row[num])
+            if durchmesser:
+                #print("Durchmesser: " + row[1])
+                dimensions.append("Durchmesser: " + row[num])
+                durchmesser = False
+                continue
+            if row[num] == "%%c":
+                durchmesser = True
+            if row[num] == "-" or row[num] == "+":
+                vorzeichen = row[num]
+            isnumber = re.findall(r"\d*\,\d+", row[num])
+            if isnumber:
+                if vorzeichen != "nix":
+                    #print(vorzeichen + isnumber[0])
+                    dimensions.append(vorzeichen + isnumber[0])
+                else:
+                    if row[num][0]!="?":
+                        #print(isnumber[0])
+                        dimensions.append(isnumber[0])
+                vorzeichen = "nix"
+            if row[num][0] == "?":
+                #print("+/- " + row[1][1:])
+                dimensions.append("+/- " + row[num][1:])
 
-    dim = []
-    dim_count = 0
-    for x in dimensions:
-        if x == "Durchmesser: ":
-            dim_count = 0
-        if dim_count > 2:
-            dim_count = 0
-        if dim_count == 0:
-            print("Maße: " + "\n" + x)
-            dim_count += 1
-            continue
-        if dim_count == 1:
-            print ("Toleranzen: " + "\n" + x)
-            dim_count += 1
-            if "+/-" in x:
+        print(isos)
+        print(f'Processed {line_count} lines.')
+
+        dim = []
+        dim_count = 0
+        for x in dimensions:
+            if x == "Durchmesser: ":
+                dim_count = 0
+            if dim_count > 2:
+                dim_count = 0
+            if dim_count == 0:
+                print("Maße: " + "\n" + x)
+                dim_count += 1
+                continue
+            if dim_count == 1:
+                print ("Toleranzen: " + "\n" + x)
+                dim_count += 1
+                if "+/-" in x:
+                    dim_count += 1
+                continue
+            if dim_count == 2:
+                print(x)
                 dim_count += 1
-            continue
-        if dim_count == 2:
-            print(x)
-            dim_count += 1
-            continue
+                continue
 
 

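In read_dimensions(), the pattern \d*\,\d+ matches German-style decimals (comma as separator), %%c is the AutoCAD control code for the diameter sign, and a leading "?" appears to be a mis-decoded tolerance symbol in the merged CSV. A small illustration of the regex on made-up tokens (not data from this repository):

    import re

    # Made-up tokens in the shape read_dimensions() expects in column `num`.
    for token in ["12,5", "+", "0,05", "%%c", "?0,1"]:
        print(token, re.findall(r"\d*\,\d+", token))
    # "12,5" and "0,05" match; "+" and "%%c" do not; "?0,1" matches as "0,1",
    # but the leading "?" makes read_dimensions() record it as "+/- 0,1" instead.
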
+ 0 - 16
read_pdf.py

@@ -1,16 +0,0 @@
-# from tika import parser
-#
-# raw = parser.from_file("GV_12.pdf")
-# raw = str(raw)
-#
-# safe_text = raw.encode('utf-8', errors='ignore')
-#
-# #safe_text = str(safe_text).replace("\n", "").replace("\\", "")
-# print('--- safe text ---' )
-# print(safe_text
-#
-# )
-
-
-import textract
-text = textract.process("GV_12.pdf")

+ 14 - 14
read_text_lines.py

@@ -1,9 +1,9 @@
 import csv
 import math
 
-def printsection(b):
+def printsection(buffer, file_out):
     #print(b)
-    obj = dict(zip(b[::2], b[1::2]))
+    obj = dict(zip(buffer[::2], buffer[1::2]))
     for keys, values in obj.items():
         if keys == '1':
             try:
@@ -12,7 +12,7 @@ def printsection(b):
                 #print("\n")
 
                 row = [values,math.floor(float(obj['10'])),math.floor(float(obj['20']))]
-                with open('text.csv', 'a') as csvFile:
+                with open(file_out, 'a') as csvFile:
                     writer = csv.writer(csvFile, delimiter =';')
                     if row[0] != '':
                         writer.writerow(row)
@@ -28,14 +28,14 @@ def printsection(b):
     #if obj.get('1'):
     #    print('{}'.format(obj['1']))
 
-
-buffer = []
-file = open("Stahl_Adapterplatte.DXF", "r")
-for line in file:
-    line = line.strip()
-    #print(line)
-    if line == '100':         # we've started a new section, so
-        printsection(buffer)      # handle the captured section
-        buffer = []             # and start a new one
-    buffer.append(line)
-printsection(buffer)
+def read(file, file_out):
+    buffer = []
+    file = open(file, "r")
+    for line in file:
+        line = line.strip()
+        #print(line)
+        if line == '100':         # we've started a new section, so
+            printsection(buffer, file_out)      # handle the captured section
+            buffer = []             # and start a new one
+        buffer.append(line)
+    printsection(buffer, file_out)
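
With this commit the three remaining scripts expose functions instead of running at import time: read() in read_text_lines.py, merge_lines() in merge_pandas.py, and read_dimensions() in read_data.py. A hypothetical driver wiring them together, not part of this commit (the file names, the drawings/ path from the updated encodings.xml, and the column index are assumptions based on the defaults in the replaced scripts):

    from read_text_lines import read
    from merge_pandas import merge_lines
    from read_data import read_dimensions

    read("drawings/Stahl_Adapterplatte.DXF", "text.csv")  # DXF -> "text;x;y" rows
    merge_lines("text.csv")                               # merges fragments sharing X/Y
    read_dimensions("file_out.csv", 1)   # as committed, merge_lines writes file_out.csv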