@@ -1,27 +1,20 @@
# coding=utf8
import re
-import csv_to_text
-import csv
import pandas
def clean(extracted_dimensions):
#next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
isos = []
- for dim in extracted_dimensions:
- if re.match(regex_isos, dim): #isos
- match = re.findall(regex_isos,dim)
- print(match)
- isos.append(match[0])
- extracted_dimensions.remove(dim)
+ for line in extracted_dimensions:
+ matches = re.findall(regex_isos,line)
+ for match in matches:
+ isos.append(match)
i = 0
new_matches = []
for match in extracted_dimensions:
- # print(match)
match = match.split('\n')[0]
- # if len(match)>1:
- # extraction.append(match[1])
- # print(match[1])
if not re.search(reg_all, match):
i += 1
@@ -32,68 +25,70 @@ def clean(extracted_dimensions):
# Single-letter codes that are rewritten to Unicode symbols. Each entry is
# (trigger pattern, letter, symbol): once the trigger is found anywhere in an
# entry, every occurrence of the letter in that entry is replaced.
# The trailing comments give the Unicode character names.
_LETTER_SYMBOL_RULES = (
    (re.compile(r"b\s\d*\W?\d*\s."), 'b', u"\u27C2"),  # PERPENDICULAR
    (re.compile(r"g\s\d*\W?\d*"), 'g', u"\u232D"),  # CYLINDRICITY
    (re.compile(r"f\s\d*\W?\d*"), 'f', u"\u2225"),  # PARALLEL TO
    (re.compile(r"r\s\d*\W?\d*"), 'r', u"\u25CE"),  # BULLSEYE
    (re.compile(r"i\s\d*\W?\d*"), 'i', u"\u232F"),  # SYMMETRY
    (re.compile(r"j\s\d*\W?\d*"), 'j', u"\u2316"),  # POSITION INDICATOR
    (re.compile(r"d\s\d*\W?\d*"), 'd', u"\u2313"),  # SEGMENT
    (re.compile(r"c\s+\d*"), 'c', u"\u23E5"),  # FLATNESS
    (re.compile(r"n\s+\d*"), 'n', u"\u2300"),  # DIAMETER SIGN
)

# Unconditional character substitutions, applied in order after the letter
# rules above.
_LITERAL_RULES = (
    ('È', 'GG'),
    ('`', u"\u00B1"),  # PLUS-MINUS SIGN
    ('#', "↔"),
    ('⌀', "Ø"),
)

# "<nominal> + - <upper> <lower>", rewritten to "<nominal> + <upper> - <lower>".
_TOLERANCE_RE = re.compile(
    r"(\d{1,2}\.?\d{0,2})\s\+\s-\s(\d{1,2}\.?\d{0,2})\s(\d{1,2}\.?\d{0,2})"
)

# Entries that are dropped from the result: anything containing a run of three
# or more letters, a bare "<digit> <digit>" pair, or two-plus leading letters
# followed by a digit.
_NOISE_RE = re.compile(r"[a-zA-Z]{3,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$")


def print_clean(dims):
    """Normalise extracted dimension strings and drop non-dimension entries.

    Each input string is split on the literal "CT"; every resulting piece then
    has its letter codes and special characters replaced by symbols, its
    "+ -" tolerance notation reordered, and surrounding whitespace stripped.
    Pieces matching the noise pattern are discarded.

    Not covered: inclination-angle and run-out tolerances.

    :param dims: iterable of strings to clean.
    :return: list of cleaned strings, in input order.
    """
    cleaned = []
    for raw in dims:
        for piece in raw.split("CT"):
            for trigger, letter, symbol in _LETTER_SYMBOL_RULES:
                if trigger.search(piece):
                    piece = piece.replace(letter, symbol)
            for old, new in _LITERAL_RULES:
                piece = piece.replace(old, new)
            # Backreferences make every match use its own numbers; building
            # the replacement from the first match would copy that match's
            # values into all later ones.
            piece = _TOLERANCE_RE.sub(r"\1 + \2 - \3", piece)
            piece = piece.strip()
            if not _NOISE_RE.search(piece):
                cleaned.append(piece)
    return cleaned
# Pattern constants used when filtering the extracted text.
regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)"  # pull out all groups of numbers
regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)"  # pull out the designations
regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})"  # all words (unverified), except Rpk
# ISO / EN standard references. Both alternatives share ONE capture group so
# that re.findall() yields plain strings; with a separate group per
# alternative it yields tuples such as ('ISO 2768', '').
regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?|EN\s\d*)"
reg = r"(^\d{1}$)"  # single digits (checked)
reg1 = r"(^[A-Z]{1}-?[A-Z]?$)"  # single letters (checked)
# Combined filter pattern; clean() tests each entry against it with re.search.
reg_all = re.compile(r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?|(EN\s\d*)|^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d\s\d\s\d\s\d\s\d)|BY|to:?|of|or|is|in|as|be|by |\d\d\d\d\d\d\d|\d\s\/\s\d")
extracted_dimensions = []
# Load the merged text table. The relative path is resolved against the
# current working directory. The context manager closes the handle as soon as
# pandas has read it, instead of leaving it open for the life of the process.
with open('text_merged.csv', 'r') as file:
    text_df = pandas.read_csv(file)
@@ -102,13 +97,10 @@ text = text_df['Text']
#matches = re.findall(regex, text, re.MULTILINE)
for line in text:
# First pass: collect the standard references and the filtered entries.
isos, dims = clean(extracted_dimensions)
# Second pass re-filters the surviving entries only. Its reference list is
# deliberately ignored so the references found in the first pass are not
# overwritten.
dims = clean(dims)[1]
new_dims = print_clean(dims)
for dim in new_dims:
    print(dim)