# read_isos.py
import re
import subprocess

import nltk
from tika import parser
  4. einleitung = False
  5. raw = parser.from_file('/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO2768-1.PDF')
  6. #raw = parser.from_file('iso_documents/ISO286-2.PDF')
  7. print(raw['content'])
  8. #text = raw['content']
  9. #sent_text = nltk.sent_tokenize(text)
  10. #tokenized_text = nltk.word_tokenize(sent_text.split)
  11. #tagged = nltk.pos_tag(tokenized_text)
  12. #match = text.concordance('Toleranz')
  13. #for text in sent_text:
  14. # if "Toleranz" in text and einleitung is True:
  15. # print(text)
  16. # if "Einleitung" in text:
  17. # einleitung = True
  18. import subprocess
  19. #subprocess.check_output(['ls','-l']) #all that is technically needed...
  20. cmd = 'pdftotext -layout "/home/bscheibel/PycharmProjects/dxf_reader/iso_documents/ISO8015.PDF"'
  21. print(subprocess.Popen(cmd, shell=True))
  22. #convert iso document to text
  23. text = "iso_documents/ISO8015.txt"
  24. #search for table of content with regex
  25. contents = []
  26. regex = r"(.*?)[\W]+(\d+)(?=\n|$)"
  27. r"([^\.]\d\.?\d?\.?\d?\.?\d?)\s([a-zA-Z]*)\s([a-zA-Z]*)\s*([a-zA-Z]*)\W?\s*([a-zA-Z]+)\s*\.{10,}([\d]+)"
  28. matches = re.finditer(regex, text, re.MULTILINE)
  29. #contents = re.findall(r"(.*?)[\W]+(\d+)(?=\n|$)", text, flags=re.M)
  30. #print(contents)
  31. for matchNum, match in enumerate(matches, start=1):
  32. print("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum=matchNum, start=match.start(),
  33. end=match.end(), match=match.group()))
  34. for groupNum in range(0, len(match.groups())):
  35. groupNum = groupNum + 1
  36. print("Group {groupNum} found at {start}-{end}: {group}".format(groupNum=groupNum, start=match.start(groupNum),
  37. end=match.end(groupNum),
  38. group=match.group(groupNum)))
  39. #only search for sections with toleranzen/abmaße