parser.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. # Licensed to the Apache Software Foundation (ASF) under one or more
  4. # contributor license agreements. See the NOTICE file distributed with
  5. # this work for additional information regarding copyright ownership.
  6. # The ASF licenses this file to You under the Apache License, Version 2.0
  7. # (the "License"); you may not use this file except in compliance with
  8. # the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. from .tika import parse1, callServer, ServerEndpoint
  19. import os
  20. import json
  21. def from_file(filename, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None):
  22. '''
  23. Parses a file for metadata and content
  24. :param filename: path to file which needs to be parsed
  25. :param serverEndpoint: Server endpoint url
  26. :param xmlContent: Whether or not XML content be requested.
  27. Default is 'False', which results in text content.
  28. :param headers: Request headers to be sent to the tika reset server, should
  29. be a dictionary. This is optional
  30. :return: dictionary having 'metadata' and 'content' keys.
  31. 'content' has a str value and metadata has a dict type value.
  32. '''
  33. if not xmlContent:
  34. jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path)
  35. else:
  36. jsonOutput = parse1('all', filename, serverEndpoint, services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/xml'},
  37. headers=headers, config_path=config_path)
  38. return _parse(jsonOutput)
  39. def from_buffer(string, serverEndpoint=ServerEndpoint, xmlContent=False, headers=None, config_path=None):
  40. '''
  41. Parses the content from buffer
  42. :param string: Buffer value
  43. :param serverEndpoint: Server endpoint. This is optional
  44. :param xmlContent: Whether or not XML content be requested.
  45. Default is 'False', which results in text content.
  46. :param headers: Request headers to be sent to the tika reset server, should
  47. be a dictionary. This is optional
  48. :return:
  49. '''
  50. headers = headers or {}
  51. headers.update({'Accept': 'application/json'})
  52. if not xmlContent:
  53. status, response = callServer('put', serverEndpoint, '/rmeta/text', string, headers, False, config_path=config_path)
  54. else:
  55. status, response = callServer('put', serverEndpoint, '/rmeta/xml', string, headers, False, config_path=config_path)
  56. return _parse((status,response))
  57. def _parse(jsonOutput):
  58. '''
  59. Parses JSON response from Tika REST API server
  60. :param jsonOutput: JSON output from Tika Server
  61. :return: a dictionary having 'metadata' and 'content' values
  62. '''
  63. parsed={}
  64. if not jsonOutput:
  65. return parsed
  66. parsed["status"] = jsonOutput[0]
  67. if jsonOutput[1] == None or jsonOutput[1] == "":
  68. return parsed
  69. realJson = json.loads(jsonOutput[1])
  70. content = ""
  71. for js in realJson:
  72. if "X-TIKA:content" in js:
  73. content += js["X-TIKA:content"]
  74. if content == "":
  75. content = None
  76. parsed["content"] = content
  77. parsed["metadata"] = {}
  78. for js in realJson:
  79. for n in js:
  80. if n != "X-TIKA:content":
  81. if n in parsed["metadata"]:
  82. if not isinstance(parsed["metadata"][n], list):
  83. parsed["metadata"][n] = [parsed["metadata"][n]]
  84. parsed["metadata"][n].append(js[n])
  85. else:
  86. parsed["metadata"][n] = js[n]
  87. return parsed