123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- # coding=utf8
- import re
- import csv_to_text
- import csv
- import pandas
def clean(extracted_dimensions):
    """Split raw text fragments into ISO references and dimension candidates.

    A fragment that starts with an ISO designation (module-level
    ``regex_isos``) contributes that designation to the first result and is
    excluded from further processing.  Every other fragment is cut at its
    first line break and kept only if the module-level ``reg_all`` pattern
    does not occur anywhere in it.

    The input is not modified.  The previous implementation removed entries
    from the list while iterating over it, which made the loop skip the
    fragment following every ISO hit (so consecutive ISO entries were lost).

    Args:
        extracted_dimensions: iterable of text fragments (strings).

    Returns:
        A tuple ``(isos, new_matches)``: the ISO designations found, and the
        first lines of the fragments that survived the ``reg_all`` filter.
    """
    isos = []
    remaining = []
    for dim in extracted_dimensions:
        if re.match(regex_isos, dim):
            match = re.findall(regex_isos, dim)
            # Kept from the original: echoes every ISO hit of this fragment.
            print(match)
            isos.append(match[0])
        else:
            remaining.append(dim)

    new_matches = []
    for fragment in remaining:
        # Only the first line of a multi-line fragment is considered.
        first_line = fragment.split('\n')[0]
        if not re.search(reg_all, first_line):
            new_matches.append(first_line)
    return isos, new_matches
# (pattern anchored at the start of the fragment, letter to replace, symbol).
# Rules are tried in this order and only the first matching one is applied.
_SYMBOL_RULES = (
    (re.compile(r"b\s\d*\W?\d*\s."), 'b', '⏊'),
    (re.compile(r"g\s\d*\W?\d*"), 'g', '⌭ '),
    (re.compile(r"f\s\d*\W?\d*"), 'f', "\u2225"),
    (re.compile(r"r\s\d*\W?\d*"), 'r', '⌾'),
    (re.compile(r"i\s\d*\W?\d*"), 'i', '⌯'),
    # NOTE(review): the replacement is an empty string in this copy of the
    # file; a symbol may have been lost in transit -- confirm the intended one.
    (re.compile(r"j\s\d*\W?\d*"), 'j', ''),
    (re.compile(r"c\s+\d*"), 'c', '⏥'),
    (re.compile(r"n\s+\d*"), 'n', '⌀'),
)


def print_clean(dims):
    """Return a copy of ``dims`` with placeholder letters turned into symbols.

    A fragment that starts with one of the placeholder letters followed by
    whitespace (see ``_SYMBOL_RULES``) gets every occurrence of that letter
    replaced by the associated symbol.  A fragment that matches none of the
    rules has every ``É`` replaced by ``GG``.  All other fragments are
    copied unchanged.

    The previous implementation only rebound its loop variable, so the
    substitutions were thrown away and the input came back untouched.

    Not covered (per the original author's note): angularity and run-out
    tolerances.

    Args:
        dims: iterable of text fragments (strings); it is not modified.

    Returns:
        A new list with one (possibly rewritten) string per input fragment.
    """
    cleaned = []
    for dim in dims:
        for pattern, letter, symbol in _SYMBOL_RULES:
            if pattern.match(dim):
                dim = dim.replace(letter, symbol)
                break
        else:
            # No placeholder rule applied; fall back to the 'É' substitution.
            dim = dim.replace('É', 'GG')
        cleaned.append(dim)
    return cleaned
def merge(dims):
    """Join neighbouring text fragments that belong to one dimension.

    The fragments are walked in order and each one is compared with the
    fragment held back from the previous step:

    * held-back fragment is an optional digit followed by ``x`` (``"2x"``):
      the pair becomes ``"<held> <current>"``;
    * current fragment starts with ``R0``, or the held-back fragment is a
      lone ``°``: the pair becomes ``"<current><held>"``;
    * otherwise the held-back fragment is emitted unchanged and the current
      one is held back instead.

    A merged pair is emitted once and neither half is reused.

    The previous implementation built such a list but returned its input
    instead; the list it built also began with an empty string, lost the
    final fragment and repeated fragments that had already been merged.

    NOTE(review): the ``R0`` rule fires for any fragment that starts with
    ``R0`` (pattern kept from the original) -- confirm that this is intended
    for fragments that are already complete, e.g. ``"R0,5"``.

    Args:
        dims: iterable of text fragments (strings); it is not modified.

    Returns:
        A new list of fragments with the pairs described above joined.
    """
    merged = []
    pending = None  # fragment waiting to see whether its successor attaches
    for dim in dims:
        if pending is None:
            pending = dim
        elif re.match(r"\d?x$", pending):
            merged.append(pending + " " + dim)
            pending = None
        elif re.match(r"R0", dim) or re.match(r"^°$", pending):
            merged.append(dim + pending)
            pending = None
        else:
            merged.append(pending)
            pending = dim
    if pending is not None:
        merged.append(pending)
    return merged
# Patterns from earlier experiments; clean() only reads regex_isos and reg_all.
regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)"  # all groups of numbers
regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)"  # to pull out the designations
regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})"  # all words (???) except Rpk
regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)"  # ISO standards
reg = r"(^\d{1}$)"  # single digits (checked)
reg1 = r"(^[A-Z]{1}-?[A-Z]?$)"  # single letters (checked)
# Everything clean() should discard, combined into one alternation.
reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")

# Load the merged text CSV and collect the stripped entries of its 'Text' column.
extracted_dimensions = []
# The context manager closes the handle once pandas has read the file
# (the handle used to stay open for the lifetime of the process).
with open('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged.csv', 'r') as file:
    text_df = pandas.read_csv(file)
text = text_df['Text']
extracted_dimensions.extend(line.strip() for line in text)

# Separate the ISO references from the dimension candidates.
isos, dims = clean(extracted_dimensions)
print(isos)

new_dims = merge(dims)
print(new_dims)

dims = print_clean(dims)
print(dims)
|