# read_html_to_csv.py (1.3 KB)

  1. import re
  2. import csv
  3. with open("/home/bscheibel/PycharmProjects/dxf_reader/drawings/5129275_Rev01-GV12.html", "r") as f:
  4. with open('values.csv', 'w') as writeFile:
  5. for line in f.readlines():
  6. #print(line)
  7. row = []
  8. if "<word" in line:
  9. exMin = r"xMin=\"(\d*\.?\d*)"
  10. exMin = re.findall(exMin,line)[0]
  11. row.append(exMin)
  12. eyMin = r"yMin=\"(\d*\.?\d*)"
  13. eyMin = re.findall(eyMin,line)[0]
  14. row.append(eyMin)
  15. exMax = r"xMax=\"(\d*\.?\d*)"
  16. exMax = re.findall(exMax,line)[0]
  17. row.append(exMax)
  18. eyMax = r"yMax=\"(\d*\.?\d*)"
  19. eyMax = re.findall(eyMax,line)[0]
  20. row.append(eyMax)
  21. Text = r">(.+)<" #wieso wird was mit "" extrahiert???
  22. Text = re.findall(Text,line)[0]
  23. row.append(Text.replace(',','.'))
  24. avgX=(float(exMin)+float(exMax))/2.0
  25. row.append(avgX)
  26. avgY=(float(eyMin)+float(eyMax))/2.0
  27. row.append(avgY)
  28. row.append(False)
  29. writer = csv.writer(writeFile)
  30. writer.writerow(row)
  31. writeFile.close()