read_tables.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. import tabula
  2. import camelot
  3. import subprocess
  4. import re
  5. #tables = tabula.read_pdf("iso_documents/ISO2768-1.PDF", pages=3)
  6. #for table in tables:
  7. # print(table)
  8. #pdftotext - layout!!!!
  9. #tabula.convert_into("iso_documents/ISO2768-1.PDF", "output_mit_tabula.csv", output_format="csv", pages='all', multiple_tables=True)
  10. #df = tabula.read_pdf("iso_documents/ISO1101.PDF", pages='all', multiple_tables=True)
  11. #print(df)
  def file_read(fname):
      """Read a text file and collect its lines with all spaces removed.

      Every line is stripped of leading/trailing whitespace and then has each
      remaining space character deleted. The resulting list is printed once
      (as before) and additionally returned so callers can work with it.

      Args:
          fname: Path of the text file to read.

      Returns:
          A list with one cleaned string per line of the file; empty lines
          yield empty strings.
      """
      with open(fname) as f:
          # Iterating the file object streams it line by line.
          content_array = [line.strip().replace(" ", "") for line in f]
      print(content_array)
      return content_array
  19. #file_read('drawings/5129275_Rev01-GV12.txt')
  # Source PDF; the path is machine-specific and shared by both extraction steps.
  pdf_path = "/home/bscheibel/PycharmProjects/engineering_drawings_extraction/iso_documents/ISO2768-1.PDF"

  # Step 1: let camelot detect the tables on page 3 and export them as CSV.
  tables = camelot.read_pdf(pdf_path, pages="3")
  tables.export('output_mit_camelot.csv', f='csv')

  # Step 2: dump the document through `less` and scan the text for data rows.
  # NOTE(review): whether `less` yields readable text for a PDF presumably
  # depends on an input preprocessor being configured on this machine - confirm.
  raw_output = subprocess.check_output(["less", pdf_path])
  # check_output returns bytes; decode once so the str patterns below can be
  # applied (matching a str pattern against bytes raises TypeError).
  output = raw_output.decode("utf-8", errors="replace")
  print(output)

  # A data row starts with a number followed by a dot, e.g. "0.5 ...".
  re_data_prefix = re.compile(r"^[0-9]+[.].*$")
  # A field is a run of tokens separated by single spaces; two or more
  # consecutive spaces end the field.
  re_data_fields = re.compile(r"(([^ ]+[ ]?)+)")
  for line in output.splitlines():
      if not re_data_prefix.match(line):
          continue
      # findall returns one tuple per match; index 0 is the outer group,
      # i.e. the complete field text.
      for field in re_data_fields.findall(line):
          # Call print(); the original subscripted it (print[...]), which
          # raises TypeError in Python 3.
          print(field[0].strip())