# regex_extraction.py
  1. # coding=utf8
  2. import re
  3. import csv_to_pandadf
  4. def clean(extracted_dimensions):
  5. #next part extracts the isos and removes everything we dont need like just text or detail/maßstab, einzelne buchstaben und zahlen
  6. for dim in extracted_dimensions:
  7. if re.match(regex_isos, dim): #isos
  8. match = re.findall(regex_isos,dim)
  9. isos.append(match[0])
  10. extracted_dimensions.remove(dim)
  11. i = 0
  12. new_matches = []
  13. for match in extracted_dimensions:
  14. # print(match)
  15. match = match.split('\n')[0]
  16. # if len(match)>1:
  17. # extraction.append(match[1])
  18. # print(match[1])
  19. if not re.search(reg_all, match):
  20. new_matches.append(match)
  21. i += 1
  22. #print(isos)
  23. #print(extracted_dimensions)
  24. return isos, new_matches
  25. def print_clean(dims):
  26. mal = "no"
  27. vorzeichen = "no"
  28. for dim in dims:
  29. if re.match(r"b\s\d*\W?\d*\s.",dim):
  30. print("Rechtwinkligkeit")
  31. print(dim)
  32. continue
  33. if re.match(r"g\s\d*\W?\d*", dim):
  34. print("Zylinderform")
  35. print(dim)
  36. continue
  37. if re.match(r"g\s\d*\W?\d*", dim):
  38. print("Parallelität")
  39. print(dim)
  40. continue
  41. if re.match(r"g\s\d*\W?\d*", dim):
  42. print("Zylinderform")
  43. print(dim)
  44. continue
  45. if re.match(r"g\s\d*\W?\d*", dim):
  46. print("Konzentrizität")
  47. print(dim)
  48. continue
  49. if re.match(r"i\s\d*\W?\d*", dim):
  50. print("Symmetrie")
  51. print(dim)
  52. continue
  53. if re.match(r"j\s\d*\W?\d*", dim):
  54. print("Ortstoleranz/Mittelpunkt")
  55. print(dim)
  56. if re.match(r"n\d*", dim):
  57. print("Durchmesser")
  58. print(dim)
  59. if "É" in dim:
  60. print("Modifikator")
  61. print(dim)
  62. continue
  63. ####nicht dabei: neigungswinkel und lauftoleranzen
  64. if re.match(r"R\d*$",dim):
  65. print("Radius")
  66. print(dim)
  67. continue
  68. if "°" in dim:
  69. print("Grad")
  70. print(dim)
  71. continue
  72. if re.match(r"Ø\s*\d*\W?\d*", dim):
  73. print("Durchmesser")
  74. print(dim)
  75. continue
  76. def merge(dims):
  77. last_item = ""
  78. i = 0
  79. new_dims = []
  80. for dim in dims:
  81. dims[i] = dim.replace('È','GG')
  82. if re.match(r"\d?x$", last_item):
  83. last_item = last_item + " " + dims[i]
  84. if re.match(r"R0", dim):
  85. last_item = dim + last_item
  86. if re.match(r"^°$", last_item):
  87. last_item = dim + last_item
  88. new_dims.append(last_item)
  89. i += 1
  90. last_item = dim
  91. return dims
  92. regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
  93. regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
  94. regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
  95. regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)" #get iso standards
  96. reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
  97. reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
  98. reg_all = re.compile(r"(^[A-Z]{1}-?[A-Z]?\s*$)|([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,}?\W)|(?!0)(^\d{1}\s*$|A\d{1}|\d/\d)")
  99. extracted_dimensions = []
  100. text = csv_to_pandadf.read_csv('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged_GV12.csv')
  101. """file = open('/home/bscheibel/PycharmProjects/dxf_reader/temporary/text_merged.csv', 'r')
  102. text = file.read()
  103. file.close()
  104. matches = re.findall(regex, text, re.MULTILINE) """
  105. for match in text:
  106. extracted_dimensions.append(match.strip())
  107. #print(extracted_dimensions)
  108. """for dim in extracted_dimensions:
  109. print( [dim] )"""
  110. isos = []
  111. isos, dims = clean(extracted_dimensions)
  112. for dim in dims:
  113. print(dim)
  114. #print(isos)
  115. new_dims = []
  116. new_dims = merge(dims)
  117. print(new_dims)
  118. #print_clean(dims)