regex_extraction.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. # coding=utf8
  2. import re
  3. import pandas
  4. def clean(extracted_dimensions):
  5. #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
  6. isos = []
  7. for line in extracted_dimensions:
  8. matches = re.findall(regex_isos,line)
  9. for match in matches:
  10. isos.append(match)
  11. i = 0
  12. new_matches = []
  13. for match in extracted_dimensions:
  14. match = match.split('\n')[0]
  15. if not re.search(reg_all, match):
  16. new_matches.append(match)
  17. i += 1
  18. #print(isos)
  19. #print(extracted_dimensions)
  20. return isos, new_matches
  21. def print_clean(dims):
  22. dims_new = []
  23. dimss = []
  24. for dim in dims:
  25. dim = re.split("CT",dim)
  26. dimss.extend(dim)
  27. #print(dimss)
  28. for dim in dimss:
  29. if re.search(r"b\s\d*\W?\d*\s.",dim):
  30. dim = dim.replace('b', u"\u27C2")
  31. if re.search(r"g\s\d*\W?\d*", dim):
  32. dim = dim.replace('g', u"\u232D")
  33. if re.search(r"f\s\d*\W?\d*", dim):
  34. dim = dim.replace('f', u"\u2225")
  35. if re.search(r"r\s\d*\W?\d*", dim):
  36. dim = dim.replace('r', u"\u25CE")
  37. if re.search(r"i\s\d*\W?\d*", dim):
  38. dim = dim.replace('i', u"\u232F")
  39. if re.search(r"j\s\d*\W?\d*", dim):
  40. dim = dim.replace('j', u"\u2316")
  41. if re.search(r"d\s\d*\W?\d*", dim):
  42. dim = dim.replace('d', u"\u2313")
  43. if re.search(r"c\s+\d*", dim):
  44. dim = dim.replace('c', u"\u23E5")
  45. if re.search(r"n\s+\d*", dim):
  46. dim = dim.replace('n', u"\u2300")
  47. if "È" in dim:
  48. dim = dim.replace('È', 'GG')
  49. if "`" in dim:
  50. dim = dim.replace('`', u"\u00B1")
  51. if "#" in dim:
  52. dim = dim.replace('#', "↔")
  53. if "⌀" in dim:
  54. dim = dim.replace('⌀', "Ø")
  55. reg12 = re.compile(r"(\d{1,2}\.?\d{0,2})\s\+\s-\s(\d{1,2}\.?\d{0,2})\s(\d{1,2}\.?\d{0,2})")
  56. g = re.search(reg12, dim)
  57. if g:
  58. dim = re.sub(reg12, g.group(1) + " + " + g.group(2) + " - " + g.group(3), dim)
  59. dims_new.append(dim.strip())
  60. dimms = []
  61. i = 0
  62. for dim in dims_new:
  63. last_item = i - 1
  64. next_item = i + 1
  65. if not re.search(r"[a-zA-Z]{3,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$",dim):
  66. dimms.append(dim)
  67. ####nicht dabei: neigungswinkel und lauftoleranzen
  68. return dimms
  69. regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
  70. regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
  71. regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
  72. regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)|(EN\s\d*)" #get iso standards
  73. reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
  74. reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
  75. reg_all = re.compile(r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?|(EN\s\d*)|^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d\s\d\s\d\s\d\s\d)|BY|to:?|of|or|is|in|as|be|by |\d\d\d\d\d\d\d|\d\s\/\s\d")
  76. extracted_dimensions = []
  77. #text = csv_to_text.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
  78. file = open('values_clusteredfromPDF_GV12.csv', 'r')
  79. #text = file.read()
  80. #file.close()
  81. text_df = pandas.read_csv(file)
  82. text = text_df['Text']
  83. #print(text)
  84. #matches = re.findall(regex, text, re.MULTILINE)
  85. for line in text:
  86. extracted_dimensions.append(line.strip())
  87. isos, dims = clean(extracted_dimensions)
  88. #print(isos)
  89. isos, dims = clean(dims)
  90. new_dims = print_clean(dims)
  91. for dim in new_dims:
  92. print(dim)