123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
- # coding=utf8
- import re
- import pandas
- def clean(extracted_dimensions):
- #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
- isos = []
- for line in extracted_dimensions:
- matches = re.findall(regex_isos,line)
- for match in matches:
- isos.append(match)
- i = 0
- new_matches = []
- for match in extracted_dimensions:
- match = match.split('\n')[0]
- if not re.search(reg_all, match):
- new_matches.append(match)
- i += 1
- #print(isos)
- #print(extracted_dimensions)
- return isos, new_matches
- def print_clean(dims):
- dims_new = []
- dimss = []
- for dim in dims:
- dim = re.split("CT",dim)
- dimss.extend(dim)
- #print(dimss)
- for dim in dimss:
- if re.search(r"b\s\d*\W?\d*\s.",dim):
- dim = dim.replace('b', u"\u27C2")
- if re.search(r"g\s\d*\W?\d*", dim):
- dim = dim.replace('g', u"\u232D")
- if re.search(r"f\s\d*\W?\d*", dim):
- dim = dim.replace('f', u"\u2225")
- if re.search(r"r\s\d*\W?\d*", dim):
- dim = dim.replace('r', u"\u25CE")
- if re.search(r"i\s\d*\W?\d*", dim):
- dim = dim.replace('i', u"\u232F")
- if re.search(r"j\s\d*\W?\d*", dim):
- dim = dim.replace('j', u"\u2316")
- if re.search(r"d\s\d*\W?\d*", dim):
- dim = dim.replace('d', u"\u2313")
- if re.search(r"c\s+\d*", dim):
- dim = dim.replace('c', u"\u23E5")
- if re.search(r"n\s+\d*", dim):
- dim = dim.replace('n', u"\u2300")
- if "È" in dim:
- dim = dim.replace('È', 'GG')
- if "`" in dim:
- dim = dim.replace('`', u"\u00B1")
- if "#" in dim:
- dim = dim.replace('#', "↔")
- if "⌀" in dim:
- dim = dim.replace('⌀', "Ø")
- reg12 = re.compile(r"(\d{1,2}\.?\d{0,2})\s\+\s-\s(\d{1,2}\.?\d{0,2})\s(\d{1,2}\.?\d{0,2})")
- g = re.search(reg12, dim)
- if g:
- dim = re.sub(reg12, g.group(1) + " + " + g.group(2) + " - " + g.group(3), dim)
- dims_new.append(dim.strip())
- dimms = []
- i = 0
- for dim in dims_new:
- last_item = i - 1
- next_item = i + 1
- if not re.search(r"[a-zA-Z]{3,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$",dim):
- dimms.append(dim)
- ####nicht dabei: neigungswinkel und lauftoleranzen
- return dimms
- regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
- regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
- regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
- regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)|(EN\s\d*)" #get iso standards
- reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
- reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
- reg_all = re.compile(r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?|(EN\s\d*)|^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d\s\d\s\d\s\d\s\d)|BY|to:?|of|or|is|in|as|be|by |\d\d\d\d\d\d\d|\d\s\/\s\d")
- extracted_dimensions = []
- #text = csv_to_text.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
- file = open('values_clusteredfromPDF_GV12.csv', 'r')
- #text = file.read()
- #file.close()
- text_df = pandas.read_csv(file)
- text = text_df['Text']
- #print(text)
- #matches = re.findall(regex, text, re.MULTILINE)
- for line in text:
- extracted_dimensions.append(line.strip())
- isos, dims = clean(extracted_dimensions)
- #print(isos)
- isos, dims = clean(dims)
- new_dims = print_clean(dims)
- for dim in new_dims:
- print(dim)
|