regex_extraction.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. # coding=utf8
  2. import re
  3. import pandas
  4. def clean(extracted_dimensions):
  5. #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
  6. isos = []
  7. for line in extracted_dimensions:
  8. matches = re.findall(regex_isos,line)
  9. for match in matches:
  10. isos.append(match)
  11. i = 0
  12. new_matches = []
  13. for match in extracted_dimensions:
  14. match = match.split('\n')[0]
  15. if not re.search(reg_all, match):
  16. new_matches.append(match)
  17. i += 1
  18. #print(isos)
  19. #print(extracted_dimensions)
  20. return isos, new_matches
  21. def print_clean(dims):
  22. dims_new = []
  23. dimss = []
  24. #or dim in dims:
  25. # if "CT" in dim:
  26. # dim = re.split("CT",dim)
  27. # for di in dim:
  28. # dimss.extend(di)
  29. #print(dimss)
  30. for dim in dimss:
  31. if re.search(r"b\s\d*\W?\d*\s.",dim):
  32. dim = dim.replace('b', u"\u27C2")
  33. if re.search(r"g\s\d*\W?\d*", dim):
  34. dim = dim.replace('g', u"\u232D")
  35. if re.search(r"f\s\d*\W?\d*", dim):
  36. dim = dim.replace('f', u"\u2225")
  37. if re.search(r"r\s\d*\W?\d*", dim):
  38. dim = dim.replace('r', u"\u25CE")
  39. if re.search(r"i\s\d*\W?\d*", dim):
  40. dim = dim.replace('i', u"\u232F")
  41. if re.search(r"j\s\d*\W?\d*", dim):
  42. dim = dim.replace('j', u"\u2316")
  43. if re.search(r"d\s\d*\W?\d*", dim):
  44. dim = dim.replace('d', u"\u2313")
  45. if re.search(r"c\s+\d*", dim):
  46. dim = dim.replace('c', u"\u23E5")
  47. if re.search(r"n\s+\d*", dim):
  48. dim = dim.replace('n', u"\u2300")
  49. if "È" in dim:
  50. dim = dim.replace('È', 'GG')
  51. if "`" in dim:
  52. dim = dim.replace('`', u"\u00B1")
  53. if "#" in dim:
  54. dim = dim.replace('#', "↔")
  55. if "⌀" in dim:
  56. dim = dim.replace('⌀', "Ø")
  57. reg12 = re.compile(r"(\d{1,2}\.?\d{0,2})\s\+\s-\s(\d{1,2}\.?\d{0,2})\s(\d{1,2}\.?\d{0,2})") ##???? was machst du?? nach toleranzen suchen, mit +/- blabla
  58. g = re.search(reg12, dim)
  59. if g:
  60. dim = re.sub(reg12, g.group(1) + " + " + g.group(2) + " - " + g.group(3), dim) # +/- toleranzen schön darstellen
  61. dims_new.append(dim.strip())
  62. dimms = []
  63. i = 0
  64. for dim in dims_new:
  65. last_item = i - 1
  66. next_item = i + 1
  67. if not re.search(r"[a-zA-Z]{3,}|^\d\s\d$|^[a-zA-Z]{2,}\d.*$",dim) and not dim == "-":
  68. dimms.append(dim)
  69. ####nicht dabei: neigungswinkel und lauftoleranzen
  70. print(dimms)
  71. return dimms
# Module-level patterns. Of these, clean() uses regex_isos and reg_all; the
# others are not referenced by the functions visible in this file.

# Groups of whitespace-separated tokens ending in whitespace
# (original note: pull out all groups of numbers).
regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)"
# An uppercase letter (optionally a second one), a non-word character,
# "<1-2 digits> : <1-2 digits>" and a closing non-word character
# (original note: to pull out the designations).
regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)"
# Runs of three or more letters, with a lookahead that rejects a match
# starting at "Rpk" (original note: all words out??? except Rpk).
regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})"
# Standard references: "ISO" + whitespace + 3-4 digits with up to three
# optional separator/digit pairs, or "EN" + whitespace + digits. Two capture
# groups, so re.findall returns 2-tuples.
regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)|(EN\s\d*)"
# A string consisting of exactly one digit (original note: checked).
reg = r"(^\d{1}$)"
# A string consisting of one uppercase letter, optionally followed by "-"
# and/or a second uppercase letter (original note: checked).
reg1 = r"(^[A-Z]{1}-?[A-Z]?$)"
# Combined rejection pattern: standard references, lone letters, designation
# labels, words of three or more letters followed by a non-word character,
# lone digits, "A<digit>", five space-separated digits, a handful of short
# English words, seven-digit runs and "<digit> / <digit>". clean() discards
# every line in which this pattern matches anywhere.
reg_all = re.compile(r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?|(EN\s\d*)|^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d\s\d\s\d\s\d\s\d)|BY|to:?|of|or|is|in|as|be|by |\d\d\d\d\d\d\d|\d\s\/\s\d")
# Module-level list; the functions visible in this file never write to it
# (only a commented-out line refers to it).
extracted_dimensions = []
  80. #text = csv_to_text.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
  81. #file = open('values_clusteredfromPDF_GV12.csv', 'r')
  82. #text = file.read()
  83. #file.close()
  84. #text_df = pandas.read_csv(file)
  85. def extract_pretty(input):
  86. #text = input['element']
  87. text_all = []
  88. for key, value in input:
  89. text_combined = ""
  90. #new_arr = ""
  91. # print(element)
  92. element = eval(element)
  93. for x in element:
  94. text_combined += x[4] + " "
  95. #print(x[4])
  96. text_all.append(text_combined)
  97. #print(text_all)
  98. #for line in text_combined:
  99. # extracted_dimensions.append(line.strip())
  100. isos, dims = clean(text_all)
  101. #print(isos)
  102. #isos, dims = clean(dims)
  103. new_dims = print_clean(dims)
  104. #for dim in dims:
  105. # print(dim)
  106. return new_dims