regex_extraction.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. # coding=utf8
  2. import re
  3. def clean(extracted_dimensions):
  4. #next part extracts the isos and removes everything we dont need like just text or the X:X stuff, einzelne buchstaben und zahlen
  5. for dim in extracted_dimensions:
  6. if re.match(regex_isos, dim): #isos
  7. match = re.findall(regex_isos,dim)
  8. isos.append(match[0])
  9. extracted_dimensions.remove(dim)
  10. for dim in extracted_dimensions:
  11. match =re.match(reg_all, dim)
  12. if match:
  13. #print(re.findall(reg_all,dim))
  14. #print(match[0])
  15. try:
  16. extracted_dimensions.remove(dim)
  17. except:
  18. print("error")
  19. #print(isos)
  20. #print(extracted_dimensions)
  21. return isos, extracted_dimensions
  22. def print_clean(extracted_dimensions):
  23. for dim in extracted_dimensions:
  24. if "b" in dim:
  25. print("Rechtwinkligkeit")
  26. print(dim)
  27. if "g" in dim:
  28. print("Zylinderform")
  29. print(dim)
  30. if "f" in dim:
  31. print("Parallelität")
  32. print(dim)
  33. if "c" in dim:
  34. print("Zylinderform")
  35. print(dim)
  36. if "r" in dim:
  37. print("Konzentrizität?")
  38. print(dim)
  39. if "i" in dim:
  40. print("Symmetrie")
  41. print(dim)
  42. if "j" in dim:
  43. print("Ortstoleranz/Mittelpunkt")
  44. print(dim)
  45. if "n" in dim:
  46. print("Durchmesser")
  47. print(dim)
  48. if "É" in dim:
  49. print("Modifikator")
  50. print(dim)
  51. ####nicht dabei: neigungswinkel und lauftoleranzen
  52. if "R" in dim:
  53. print("Radius")
  54. print(dim)
  55. if "°" in dim:
  56. print("Grad")
  57. if "Ø" in dim:
  58. print("Durchmesser")
  59. regex = r"(\S+\s{1,3}?\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s?\S*\S*\s+)" #alle gruppen von zahlen raus
  60. regex1 = r"([A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)" #ti get the bezeichnungen raus
  61. regex2 = r"((?!\d)(?!Rpk)[a-zA-Z]{3,})" #alle wörter raus??? außer Rpk
  62. regex_isos = r"(ISO\s\d\d\d\d?\W?\d?\W?\d?\W?\d?)" #get iso standards
  63. reg = r"(^\d{1}$)" #einzelne Zahlen raus #checked
  64. reg1 = r"(^[A-Z]{1}-?[A-Z]?$)" #einzelne Buchstaben raus #checked
  65. reg_all = r"(^(?!0)\d{1}$)|(^[A-Z]{1}-?[A-Z]?$)|(^[A-Z]\W?[A-Z]?\s?\W\s?\d\d?\s?\s?:\s?\d\d?\s?\W)|((?!\d)(?!Rpk)[a-zA-Z]{3,})"
  66. extracted_dimensions = []
  67. file = open('/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.txt', 'r')
  68. text = file.read()
  69. file.close()
  70. matches = re.findall(regex, text, re.MULTILINE)
  71. for match in matches:
  72. extracted_dimensions.append(match.strip())
  73. #print(extracted_dimensions)
  74. isos = []
  75. isos, dims = clean(extracted_dimensions)
  76. #print(isos)
  77. #dims = clean(dims)
  78. for dim in dims:
  79. print(dim)
  80. print_clean(dims)