# coding=utf8
"""Scan a text file with a regular expression and print the stripped matches."""
import re

# Roughly: one non-whitespace token, 1-3 whitespace characters, then up to
# five further tokens (each optionally preceded by a single whitespace
# character), terminated by at least one whitespace character.
regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)"
# Meant to pull out the designations ("Bezeichnungen"): a capital letter
# followed by two 1-2 digit numbers, each delimited by non-word characters.
# Not used yet.
regex1 = r"([A-Z]\s?\W\s?\d\d?\s?\s?\W\s?\d\d?\s?\W)"
# Runs of three or more ASCII letters -- extract all words? Not used yet.
regex2 = r"([a-zA-Z]{3,})"

# The context manager closes the file even if reading fails; the explicit
# encoding keeps decoding independent of the machine's locale settings.
with open('/home/bscheibel/PycharmProjects/dxf_reader/drawings/5152166_Rev04.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# re.MULTILINE only affects ^ and $, which this pattern does not use; the
# flag is kept so the call stays as the author wrote it.
matches = re.findall(regex, text, re.MULTILINE)
extracted_dimensions = [match.strip() for match in matches]

# Next part is meant to replace everything that is not needed.
# NOTE(review): as written this is a no-op placeholder -- the empty group
# "()" matches any string, and substituting the empty pattern with '' leaves
# `string` unchanged. Fill in real patterns before relying on it.
string = "test test2 test3"
if re.search("()", string, re.IGNORECASE):
    r = re.compile(r"", re.IGNORECASE)
    string = r.sub(r'', string)

print(extracted_dimensions)