unpack.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. # Licensed to the Apache Software Foundation (ASF) under one or more
  4. # contributor license agreements. See the NOTICE file distributed with
  5. # this work for additional information regarding copyright ownership.
  6. # The ASF licenses this file to You under the Apache License, Version 2.0
  7. # (the "License"); you may not use this file except in compliance with
  8. # the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. from .tika import parse1, callServer, ServerEndpoint
  19. import tarfile
  20. from io import BytesIO, TextIOWrapper
  21. import csv
  22. from sys import version_info
  23. # Python 3 introduced .readable() to tarfile extracted files objects - this
  24. # is required to wrap a TextIOWrapper around the object. However, wrapping
  25. # with TextIOWrapper is only required for csv.reader() in Python 3, so the
  26. # tarfile returned object can be used as is in earlier versions.
  27. _text_wrapper = TextIOWrapper if version_info.major >= 3 else lambda x: x
  28. def from_file(filename, serverEndpoint=ServerEndpoint):
  29. '''
  30. Parse from file
  31. :param filename: file
  32. :param serverEndpoint: Tika server end point (optional)
  33. :return:
  34. '''
  35. tarOutput = parse1('unpack', filename, serverEndpoint,
  36. responseMimeType='application/x-tar',
  37. services={'meta': '/meta', 'text': '/tika',
  38. 'all': '/rmeta/xml', 'unpack': '/unpack/all'},
  39. rawResponse=True)
  40. return _parse(tarOutput)
  41. def from_buffer(string, serverEndpoint=ServerEndpoint):
  42. '''
  43. Parse from buffered content
  44. :param string: buffered content
  45. :param serverEndpoint: Tika server URL (Optional)
  46. :return: parsed content
  47. '''
  48. status, response = callServer('put', serverEndpoint, '/unpack/all', string,
  49. {'Accept': 'application/x-tar'}, False,
  50. rawResponse=True)
  51. return _parse((status, response))
  52. def _parse(tarOutput):
  53. parsed = {}
  54. if not tarOutput:
  55. return parsed
  56. elif tarOutput[1] is None or tarOutput[1] == b"":
  57. return parsed
  58. tarFile = tarfile.open(fileobj=BytesIO(tarOutput[1]))
  59. # get the member names
  60. memberNames = list(tarFile.getnames())
  61. # extract the metadata
  62. metadata = {}
  63. if "__METADATA__" in memberNames:
  64. memberNames.remove("__METADATA__")
  65. metadataMember = tarFile.getmember("__METADATA__")
  66. if not metadataMember.issym() and metadataMember.isfile():
  67. metadataFile = _text_wrapper(tarFile.extractfile(metadataMember))
  68. metadataReader = csv.reader(metadataFile)
  69. for metadataLine in metadataReader:
  70. # each metadata line comes as a key-value pair, with list values
  71. # returned as extra values in the line - convert single values
  72. # to non-list values to be consistent with parser metadata
  73. assert len(metadataLine) >= 2
  74. if len(metadataLine) > 2:
  75. metadata[metadataLine[0]] = metadataLine[1:]
  76. else:
  77. metadata[metadataLine[0]] = metadataLine[1]
  78. # get the content
  79. content = ""
  80. if "__TEXT__" in memberNames:
  81. memberNames.remove("__TEXT__")
  82. contentMember = tarFile.getmember("__TEXT__")
  83. if not contentMember.issym() and contentMember.isfile():
  84. if version_info.major >= 3:
  85. content = _text_wrapper(tarFile.extractfile(contentMember), encoding='utf8').read()
  86. else:
  87. content = tarFile.extractfile(contentMember).read().decode('utf8')
  88. # get the remaining files as attachments
  89. attachments = {}
  90. for attachment in memberNames:
  91. attachmentMember = tarFile.getmember(attachment)
  92. if not attachmentMember.issym() and attachmentMember.isfile():
  93. attachments[attachment] = tarFile.extractfile(attachmentMember).read()
  94. parsed["content"] = content
  95. parsed["metadata"] = metadata
  96. parsed["attachments"] = attachments
  97. return parsed