regex_extraction.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. # coding=utf8
  2. import re
  3. import csv_to_text
  4. import csv
  5. import pandas
  6. def clean(extracted_dimensions):
  7. #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
  8. isos = []
  9. for dim in extracted_dimensions:
  10. if re.match(regex_isos, dim): #isos
  11. match = re.findall(regex_isos,dim)
  12. print(match)
  13. isos.append(match[0])
  14. extracted_dimensions.remove(dim)
  15. i = 0
  16. new_matches = []
  17. for match in extracted_dimensions:
  18. # print(match)
  19. match = match.split('\n')[0]
  20. # if len(match)>1:
  21. # extraction.append(match[1])
  22. # print(match[1])
  23. if not re.search(reg_all, match):
  24. new_matches.append(match)
  25. i += 1
  26. #print(isos)
  27. #print(extracted_dimensions)
  28. return isos, new_matches
  29. def print_clean(dims):
  30. for dim in dims:
  31. if re.match(r"b\s\d*\W?\d*\s.",dim):
  32. dim = dim.replace('b', '⏊')
  33. continue
  34. if re.match(r"g\s\d*\W?\d*", dim):
  35. dim = dim.replace('g', '⌭ ')
  36. continue
  37. if re.match(r"f\s\d*\W?\d*", dim):
  38. dim = dim.replace('f', u"\u2225")
  39. continue
  40. if re.match(r"r\s\d*\W?\d*", dim):
  41. dim = dim.replace('r', '⌾')
  42. continue
  43. if re.match(r"i\s\d*\W?\d*", dim):
  44. dim = dim.replace('i', '⌯')
  45. continue
  46. if re.match(r"j\s\d*\W?\d*", dim):
  47. dim = dim.replace('j', '')
  48. continue
  49. if re.match(r"c\s+\d*", dim):
  50. dim = dim.replace('c', '⏥')
  51. continue
  52. if re.match(r"n\s+\d*", dim):
  53. dim = dim.replace('n', '⌀')
  54. continue
  55. if "É" in dim:
  56. dim = dim.replace('É', 'GG')
  57. continue
  58. ####nicht dabei: neigungswinkel und lauftoleranzen
  59. return dims
  60. def merge(dims):
  61. last_item = ""
  62. i = 0
  63. new_dims = []
  64. for dim in dims:
  65. if re.match(r"\d?x$", last_item):
  66. last_item = last_item + " " + dims[i]
  67. if re.match(r"R0", dim):
  68. last_item = dim + last_item
  69. if re.match(r"^°$", last_item):
  70. last_item = dim + last_item
  71. new_dims.append(last_item)
  72. i += 1
  73. last_item = dim
  74. return dims
  75. regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
  76. regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
  77. regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
  78. regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)" #get iso standards
  79. reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
  80. reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
  81. reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
  82. extracted_dimensions = []
  83. #text = csv_to_text.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
  84. file = open('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged.csv', 'r')
  85. #text = file.read()
  86. #file.close()
  87. text_df = pandas.read_csv(file)
  88. text = text_df['Text']
  89. #print(text)
  90. #matches = re.findall(regex, text, re.MULTILINE)
  91. for line in text:
  92. extracted_dimensions.append(line.strip())
  93. #print(extracted_dimensions)
  94. #isos = []
  95. isos, dims = clean(extracted_dimensions)
  96. print(isos)
  97. #new_dims = []
  98. new_dims = merge(dims)
  99. print(new_dims)
  100. dims = print_clean(dims)
  101. print(dims)