tika.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. # Licensed to the Apache Software Foundation (ASF) under one or more
  4. # contributor license agreements. See the NOTICE file distributed with
  5. # this work for additional information regarding copyright ownership.
  6. # The ASF licenses this file to You under the Apache License, Version 2.0
  7. # (the "License"); you may not use this file except in compliance with
  8. # the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing, software
  13. # distributed under the License is distributed on an "AS IS" BASIS,
  14. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. # See the License for the specific language governing permissions and
  16. # limitations under the License.
  17. #
  18. # Module documentation
  19. '''
  20. Tika Python module provides Python API client to Apache Tika Server.
  21. **Example usage**::
  22. import tika
  23. from tika import parser
  24. parsed = parser.from_file('/path/to/file')
  25. print(parsed["metadata"])
  26. print(parsed["content"])
  27. Visit https://github.com/chrismattmann/tika-python to learn more about it.
  28. **Detect IANA MIME Type**::
  29. from tika import detector
  30. print(detector.from_file('/path/to/file'))
  31. **Detect Language**::
  32. from tika import language
  33. print(language.from_file('/path/to/file'))
  34. **Use Tika Translate**::
  35. from tika import translate
  36. print(translate.from_file('/path/to/file', 'srcLang', 'destLang'))
  37. # Use auto Language detection feature
  38. print(translate.from_file('/path/to/file', 'destLang'))
  39. ***Tika-Python Configuration***
  40. You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
  41. for details on writing configuration files. Configuration is set the first time the server is started.
  42. To use a configuration file with a parser, or detector:
  43. parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
  44. or:
  45. detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
  46. or:
  47. detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
  48. '''
# Command-line help text. It is written to stderr by die() and by main() when
# -h/--help is given, and is used as the exception message for an
# unrecognised option.
USAGE = """
tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>
tika.py parse all test.pdf test2.pdf (write output JSON metadata files for test1.pdf_meta.json and test2.pdf_meta.json)
tika.py detect type test.pdf (returns mime-type as text/plain)
tika.py language file french.txt (returns language e.g., fr as text/plain)
tika.py translate fr:en french.txt (translates the file french.txt from french to english)
tika.py config mime-types (see what mime-types the Tika Server can handle)
A simple python and command-line client for Tika using the standalone Tika server (JAR file).
All commands return results in JSON format by default (except text in text/plain).
To parse docs, use:
tika.py parse <meta | text | all> <path>
To check the configuration of the Tika server, use:
tika.py config <mime-types | detectors | parsers>
Commands:
parse = parse the input file and write a JSON doc file.ext_meta.json containing the extracted metadata, text, or both
detect type = parse the stream and 'detect' the MIME/media type, return in text/plain
language file = parse the file stream and identify the language of the text, return its 2 character code in text/plain
translate src:dest = parse and extract text and then translate the text from source language to destination language
config = return a JSON doc describing the configuration of the Tika server (i.e. mime-types it
can handle, or installed detectors or parsers)
Arguments:
urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika
Switches:
--verbose, -v = verbose mode
--encode, -e = encode response in UTF-8
--csv, -c = report detect output in comma-delimited format
--server <TikaServerEndpoint> = use a remote Tika Server at this endpoint, otherwise use local server
--install <UrlToTikaServerJar> = download and exec Tika Server (JAR file), starting server on default port 9998
Example usage as python client:
-- from tika import runCommand, parse1
-- jsonOutput = runCommand('parse', 'all', filename)
or
-- jsonOutput = parse1('all', filename)
"""
  83. import sys, os, getopt, time, codecs, re
# Python 2/3 compatibility: choose the text and byte string types by probing
# for the ``unicode`` builtin, which only exists on Python 2.
try:
    unicode_string = unicode
    binary_string = str
except NameError:
    unicode_string = str
    binary_string = bytes
# ``urlretrieve`` is imported from ``urllib`` when available there, otherwise
# from ``urllib.request``.
try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve
# ``urlparse`` is imported from the ``urlparse`` module when available,
# otherwise from ``urllib.parse``.
try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse as urlparse
# Build the Content-Disposition header with rfc6266 when that optional package
# is installed; otherwise fall back to a plain ``attachment; filename=...``
# value. Either way only the base name of the path is used.
try:
    from rfc6266 import build_header
    def make_content_disposition_header(fn):
        return build_header(os.path.basename(fn)).decode('ascii')
except ImportError:
    def make_content_disposition_header(fn):
        return 'attachment; filename=%s' % os.path.basename(fn)
# On Python 2 rebind the module-level ``open`` to ``codecs.open`` so that
# calls in this module can pass an encoding as the third argument.
if sys.version_info[0] < 3:
    open = codecs.open
  107. import requests
  108. import socket
  109. import tempfile
  110. import hashlib
  111. import platform
  112. from subprocess import Popen
  113. from subprocess import STDOUT
  114. from os import walk
  115. import logging
# Client log location: the TIKA_LOG_PATH environment variable, or the system
# temp directory when it is unset.
log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir())
log_file = os.path.join(log_path, 'tika.log')
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
log = logging.getLogger('tika.tika')
# File logs
fileHandler = logging.FileHandler(log_file)
fileHandler.setFormatter(logFormatter)
log.addHandler(fileHandler)
# Stdout logs
# (logging.StreamHandler() with no argument writes to sys.stderr.)
consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
log.addHandler(consoleHandler)
# Log level
log.setLevel(logging.INFO)
# True when running on Windows; callServer() and startServer() branch on it.
Windows = True if platform.system() == "Windows" else False
# Tika server version used to build the default download URL below.
TikaVersion = os.getenv('TIKA_VERSION', '1.19')
# Directory in which checkTikaServer() looks for / stores 'tika-server.jar'.
TikaJarPath = os.getenv('TIKA_PATH', tempfile.gettempdir())
# Directory into which remote documents are downloaded before being sent.
TikaFilesPath = tempfile.gettempdir()
# Directory in which startServer() writes 'tika-server.log'.
TikaServerLogFilePath = log_path
# URL (or local path) of the Tika server jar; overridable via TIKA_SERVER_JAR.
TikaServerJar = os.getenv(
    'TIKA_SERVER_JAR',
    "http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/"+TikaVersion+"/tika-server-"+TikaVersion+".jar")
# Default host and port (kept as a string) of the Tika server.
ServerHost = "localhost"
Port = "9998"
ServerEndpoint = os.getenv(
    'TIKA_SERVER_ENDPOINT', 'http://' + ServerHost + ':' + Port)
# Translator class name inserted into the /translate/all service URL.
Translator = os.getenv(
    'TIKA_TRANSLATOR',
    "org.apache.tika.language.translate.Lingo24Translator")
# When truthy, callServer() skips checkTikaServer() and never starts a server.
# NOTE(review): os.getenv returns a string when the variable is set, so any
# non-empty value (including 'False' or '0') is truthy here - confirm intended.
TikaClientOnly = os.getenv('TIKA_CLIENT_ONLY', False)
# Extra classpath entries placed before the server jar by startServer().
TikaServerClasspath = os.getenv('TIKA_SERVER_CLASSPATH', '')
# Seconds to sleep between checks of the server log during startup, and the
# maximum number of such checks.
TikaStartupSleep = float(os.getenv('TIKA_STARTUP_SLEEP', 5))
TikaStartupMaxRetry = int(os.getenv('TIKA_STARTUP_MAX_RETRY', 3))
# Java executable used to launch the server.
TikaJava = os.getenv("TIKA_JAVA", "java")
# Command-line switches; main() sets these to 1 for -v, -e and -c.
Verbose = 0
EncodeUtf8 = 0
csvOutput = 0
  153. class TikaException(Exception):
  154. pass
  155. def echo2(*s): sys.stderr.write(unicode_string('tika.py: %s\n') % unicode_string(' ').join(map(unicode_string, s)))
  156. def warn(*s): echo2('Warn:', *s)
  157. def die(*s): warn('Error:', *s); echo2(USAGE); sys.exit()
  158. def runCommand(cmd, option, urlOrPaths, port, outDir=None,
  159. serverHost=ServerHost, tikaServerJar=TikaServerJar,
  160. verbose=Verbose, encode=EncodeUtf8):
  161. '''
  162. Run the Tika command by calling the Tika server and return results in JSON format (or plain text).
  163. :param cmd: a command from set ``{'parse', 'detect', 'language', 'translate', 'config'}``
  164. :param option:
  165. :param urlOrPaths:
  166. :param port:
  167. :param outDir:
  168. :param serverHost:
  169. :param tikaServerJar:
  170. :param verbose:
  171. :param encode:
  172. :return: response for the command, usually a ``dict``
  173. '''
  174. # import pdb; pdb.set_trace()
  175. if (cmd in 'parse' or cmd in 'detect') and (urlOrPaths == [] or urlOrPaths == None):
  176. log.exception('No URLs/paths specified.')
  177. raise TikaException('No URLs/paths specified.')
  178. serverEndpoint = 'http://' + serverHost + ':' + port
  179. if cmd == 'parse':
  180. return parseAndSave(option, urlOrPaths, outDir, serverEndpoint, verbose, tikaServerJar)
  181. elif cmd == "detect":
  182. return detectType(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
  183. elif cmd == "language":
  184. return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
  185. elif cmd == "translate":
  186. return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
  187. elif cmd == "config":
  188. status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
  189. return resp
  190. else:
  191. log.exception('Bad args')
  192. raise TikaException('Bad args')
  193. def getPaths(urlOrPaths):
  194. '''
  195. Determines if the given URL in urlOrPaths is a URL or a file or directory. If it's
  196. a directory, it walks the directory and then finds all file paths in it, and ads them
  197. too. If it's a file, it adds it to the paths. If it's a URL it just adds it to the path.
  198. :param urlOrPaths: the url or path to be scanned
  199. :return: ``list`` of paths
  200. '''
  201. if isinstance(urlOrPaths, basestring):
  202. #FIXME: basestring is undefined
  203. urlOrPaths = [urlOrPaths] # do not recursively walk over letters of a single path which can include "/"
  204. paths = []
  205. for eachUrlOrPaths in urlOrPaths:
  206. if os.path.isdir(eachUrlOrPaths):
  207. for root, directories, filenames in walk(eachUrlOrPaths):
  208. for filename in filenames:
  209. paths.append(os.path.join(root,filename))
  210. else:
  211. paths.append(eachUrlOrPaths)
  212. return paths
  213. def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  214. responseMimeType='application/json', metaExtension='_meta.json',
  215. services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}):
  216. '''
  217. Parse the objects and write extracted metadata and/or text in JSON format to matching
  218. filename with an extension of '_meta.json'.
  219. :param option:
  220. :param urlOrPaths:
  221. :param outDir:
  222. :param serverEndpoint:
  223. :param verbose:
  224. :param tikaServerJar:
  225. :param responseMimeType:
  226. :param metaExtension:
  227. :param services:
  228. :return:
  229. '''
  230. metaPaths = []
  231. paths = getPaths(urlOrPaths)
  232. for path in paths:
  233. if outDir is None:
  234. metaPath = path + metaExtension
  235. else:
  236. metaPath = os.path.join(outDir, os.path.split(path)[1] + metaExtension)
  237. log.info('Writing %s' % metaPath)
  238. with open(metaPath, 'w', 'utf-8') as f:
  239. f.write(parse1(option, path, serverEndpoint, verbose, tikaServerJar, \
  240. responseMimeType, services)[1] + u"\n")
  241. metaPaths.append(metaPath)
  242. return metaPaths
  243. def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  244. responseMimeType='application/json',
  245. services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
  246. '''
  247. Parse the objects and return extracted metadata and/or text in JSON format.
  248. :param option:
  249. :param urlOrPaths:
  250. :param serverEndpoint:
  251. :param verbose:
  252. :param tikaServerJar:
  253. :param responseMimeType:
  254. :param services:
  255. :return:
  256. '''
  257. return [parse1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
  258. for path in urlOrPaths]
  259. def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  260. responseMimeType='application/json',
  261. services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None):
  262. '''
  263. Parse the object and return extracted metadata and/or text in JSON format.
  264. :param option:
  265. :param urlOrPath:
  266. :param serverEndpoint:
  267. :param verbose:
  268. :param tikaServerJar:
  269. :param responseMimeType:
  270. :param services:
  271. :param rawResponse:
  272. :param headers:
  273. :return:
  274. '''
  275. headers = headers or {}
  276. path, file_type = getRemoteFile(urlOrPath, TikaFilesPath)
  277. headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path)})
  278. if option not in services:
  279. log.warning('config option must be one of meta, text, or all; using all.')
  280. service = services.get(option, services['all'])
  281. if service == '/tika': responseMimeType = 'text/plain'
  282. status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
  283. headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
  284. if file_type == 'remote': os.unlink(path)
  285. return (status, response)
  286. def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  287. responseMimeType='text/plain',
  288. services={'file' : '/language/stream'}):
  289. '''
  290. Detect the language of the provided stream and return its 2 character code as text/plain.
  291. :param option:
  292. :param urlOrPaths:
  293. :param serverEndpoint:
  294. :param verbose:
  295. :param tikaServerJar:
  296. :param responseMimeType:
  297. :param services:
  298. :return:
  299. '''
  300. paths = getPaths(urlOrPaths)
  301. return [detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
  302. for path in paths]
  303. def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  304. responseMimeType='text/plain',
  305. services={'file' : '/language/stream'}):
  306. '''
  307. Detect the language of the provided stream and return its 2 character code as text/plain.
  308. :param option:
  309. :param urlOrPath:
  310. :param serverEndpoint:
  311. :param verbose:
  312. :param tikaServerJar:
  313. :param responseMimeType:
  314. :param services:
  315. :return:
  316. '''
  317. path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
  318. if option not in services:
  319. log.exception('Language option must be one of %s ' % binary_string(services.keys()))
  320. raise TikaException('Language option must be one of %s ' % binary_string(services.keys()))
  321. service = services[option]
  322. status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
  323. {'Accept': responseMimeType}, verbose, tikaServerJar)
  324. return (status, response)
  325. def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  326. responseMimeType='text/plain',
  327. services={'all': '/translate/all'}):
  328. '''
  329. Translate the file from source language to destination language.
  330. :param option:
  331. :param urlOrPaths:
  332. :param serverEndpoint:
  333. :param verbose:
  334. :param tikaServerJar:
  335. :param responseMimeType:
  336. :param services:
  337. :return:
  338. '''
  339. paths = getPaths(urlOrPaths)
  340. return [doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
  341. for path in paths]
  342. def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  343. responseMimeType='text/plain',
  344. services={'all': '/translate/all'}):
  345. '''
  346. :param option:
  347. :param urlOrPath:
  348. :param serverEndpoint:
  349. :param verbose:
  350. :param tikaServerJar:
  351. :param responseMimeType:
  352. :param services:
  353. :return:
  354. '''
  355. path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
  356. srcLang = ""
  357. destLang = ""
  358. if ":" in option:
  359. options = option.rsplit(':')
  360. srcLang = options[0]
  361. destLang = options[1]
  362. if len(options) != 2:
  363. log.exception('Translate options are specified as srcLang:destLang or as destLang')
  364. raise TikaException('Translate options are specified as srcLang:destLang or as destLang')
  365. else:
  366. destLang = option
  367. if srcLang != "" and destLang != "":
  368. service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
  369. else:
  370. service = services["all"] + "/" + Translator + "/" + destLang
  371. status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
  372. {'Accept' : responseMimeType},
  373. verbose, tikaServerJar)
  374. return (status, response)
  375. def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  376. responseMimeType='text/plain',
  377. services={'type': '/detect/stream'}):
  378. '''
  379. Detect the MIME/media type of the stream and return it in text/plain.
  380. :param option:
  381. :param urlOrPaths:
  382. :param serverEndpoint:
  383. :param verbose:
  384. :param tikaServerJar:
  385. :param responseMimeType:
  386. :param services:
  387. :return:
  388. '''
  389. paths = getPaths(urlOrPaths)
  390. return [detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services)
  391. for path in paths]
  392. def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
  393. responseMimeType='text/plain',
  394. services={'type': '/detect/stream'}, config_path=None):
  395. '''
  396. Detect the MIME/media type of the stream and return it in text/plain.
  397. :param option:
  398. :param urlOrPath:
  399. :param serverEndpoint:
  400. :param verbose:
  401. :param tikaServerJar:
  402. :param responseMimeType:
  403. :param services:
  404. :return:
  405. '''
  406. path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
  407. if option not in services:
  408. log.exception('Detect option must be one of %s' % binary_string(services.keys()))
  409. raise TikaException('Detect option must be one of %s' % binary_string(services.keys()))
  410. service = services[option]
  411. status, response = callServer('put', serverEndpoint, service, open(path, 'rb'),
  412. {
  413. 'Accept': responseMimeType,
  414. 'Content-Disposition': make_content_disposition_header(path)
  415. },
  416. verbose, tikaServerJar, config_path=config_path)
  417. if csvOutput == 1:
  418. return(status, urlOrPath.decode("UTF-8") + "," + response)
  419. else:
  420. return (status, response)
  421. def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json',
  422. services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}):
  423. '''
  424. Get the configuration of the Tika Server (parsers, detectors, etc.) and return it in JSON format.
  425. :param option:
  426. :param serverEndpoint:
  427. :param verbose:
  428. :param tikaServerJar:
  429. :param responseMimeType:
  430. :param services:
  431. :return:
  432. '''
  433. if option not in services:
  434. die('config option must be one of mime-types, detectors, or parsers')
  435. service = services[option]
  436. status, response = callServer('get', serverEndpoint, service, None, {'Accept': responseMimeType}, verbose, tikaServerJar)
  437. return (status, response)
  438. def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
  439. httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
  440. rawResponse=False,config_path=None):
  441. '''
  442. Call the Tika Server, do some error checking, and return the response.
  443. :param verb:
  444. :param serverEndpoint:
  445. :param service:
  446. :param data:
  447. :param headers:
  448. :param verbose:
  449. :param tikaServerJar:
  450. :param httpVerbs:
  451. :param classpath:
  452. :return:
  453. '''
  454. parsedUrl = urlparse(serverEndpoint)
  455. serverHost = parsedUrl.hostname
  456. scheme = parsedUrl.scheme
  457. port = parsedUrl.port
  458. if classpath is None:
  459. classpath = TikaServerClasspath
  460. global TikaClientOnly
  461. if not TikaClientOnly:
  462. serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
  463. serviceUrl = serverEndpoint + service
  464. if verb not in httpVerbs:
  465. log.exception('Tika Server call must be one of %s' % binary_string(httpVerbs.keys()))
  466. raise TikaException('Tika Server call must be one of %s' % binary_string(httpVerbs.keys()))
  467. verbFn = httpVerbs[verb]
  468. if Windows and hasattr(data, "read"):
  469. data = data.read()
  470. encodedData = data
  471. if type(data) is unicode_string:
  472. encodedData = data.encode('utf-8')
  473. resp = verbFn(serviceUrl, encodedData, headers=headers, verify=False)
  474. if verbose:
  475. print(sys.stderr, "Request headers: ", headers)
  476. print(sys.stderr, "Response headers: ", resp.headers)
  477. if resp.status_code != 200:
  478. log.warning('Tika server returned status: %d', resp.status_code)
  479. resp.encoding = "utf-8"
  480. if rawResponse:
  481. return (resp.status_code, resp.content)
  482. else:
  483. return (resp.status_code, resp.text)
  484. def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None, config_path=None):
  485. '''
  486. Check that tika-server is running. If not, download JAR file and start it up.
  487. :param scheme: e.g. http or https
  488. :param serverHost:
  489. :param port:
  490. :param tikaServerJar:
  491. :param classpath:
  492. :return:
  493. '''
  494. if classpath is None:
  495. classpath = TikaServerClasspath
  496. urlp = urlparse(tikaServerJar)
  497. serverEndpoint = '%s://%s:%s' % (scheme, serverHost, port)
  498. jarPath = os.path.join(TikaJarPath, 'tika-server.jar')
  499. if 'localhost' in serverEndpoint or '127.0.0.1' in serverEndpoint:
  500. alreadyRunning = checkPortIsOpen(serverHost, port)
  501. if not alreadyRunning:
  502. if not os.path.isfile(jarPath) and urlp.scheme != '':
  503. getRemoteJar(tikaServerJar, jarPath)
  504. if not checkJarSig(tikaServerJar, jarPath):
  505. os.remove(jarPath)
  506. tikaServerJar = getRemoteJar(tikaServerJar, jarPath)
  507. status = startServer(jarPath, TikaJava, serverHost, port, classpath, config_path)
  508. if not status:
  509. log.error("Failed to receive startup confirmation from startServer.")
  510. raise RuntimeError("Unable to start Tika server.")
  511. return serverEndpoint
  512. def checkJarSig(tikaServerJar, jarPath):
  513. '''
  514. Checks the signature of Jar
  515. :param tikaServerJar:
  516. :param jarPath:
  517. :return: ``True`` if the signature of the jar matches
  518. '''
  519. if not os.path.isfile(jarPath + ".md5"):
  520. getRemoteJar(tikaServerJar + ".md5", jarPath + ".md5")
  521. m = hashlib.md5()
  522. with open(jarPath, 'rb') as f:
  523. binContents = f.read()
  524. m.update(binContents)
  525. with open(jarPath + ".md5", "r") as em:
  526. existingContents = em.read()
  527. return existingContents == m.hexdigest()
  528. def startServer(tikaServerJar, java_path = TikaJava, serverHost = ServerHost, port = Port, classpath=None, config_path=None):
  529. '''
  530. Starts Tika Server
  531. :param tikaServerJar: path to tika server jar
  532. :param serverHost: the host interface address to be used for binding the service
  533. :param port: the host port to be used for binding the service
  534. :param classpath: Class path value to pass to JVM
  535. :return: None
  536. '''
  537. if classpath is None:
  538. classpath = TikaServerClasspath
  539. host = "localhost"
  540. if Windows:
  541. host = "0.0.0.0"
  542. if classpath:
  543. classpath += ":" + tikaServerJar
  544. else:
  545. classpath = tikaServerJar
  546. # setup command string
  547. cmd_string = ""
  548. if not config_path:
  549. cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %s --host %s &' \
  550. % (java_path, classpath, port, host)
  551. else:
  552. cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %s --host %s --config %s &' \
  553. % (java_path, classpath, port, host, config_path)
  554. # Check that we can write to log path
  555. try:
  556. tika_log_file_path = os.path.join(TikaServerLogFilePath, 'tika-server.log')
  557. logFile = open(tika_log_file_path, 'w')
  558. except PermissionError as e:
  559. log.error("Unable to create tika-server.log at %s due to permission error." % (TikaServerLogFilePath))
  560. return False
  561. # Check that specified java binary is available on path
  562. try:
  563. _ = Popen(java_path, stdout=open(os.devnull, "w"), stderr=open(os.devnull, "w"))
  564. except FileNotFoundError as e:
  565. log.error("Unable to run java; is it installed?")
  566. return False
  567. # Run java with jar args
  568. cmd = Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
  569. # Check logs and retry as configured
  570. try_count = 0
  571. is_started = False
  572. while try_count < TikaStartupMaxRetry:
  573. with open(tika_log_file_path, "r") as tika_log_file_tmp:
  574. # check for INFO string to confirm listening endpoint
  575. if "Started Apache Tika server at" in tika_log_file_tmp.read():
  576. is_started = True
  577. else:
  578. log.warning("Failed to see startup log message; retrying...")
  579. time.sleep(TikaStartupSleep)
  580. try_count += 1
  581. if not is_started:
  582. log.error("Tika startup log message not received after %d tries." % (TikaStartupMaxRetry))
  583. return False
  584. else:
  585. return True
  586. def toFilename(urlOrPath):
  587. value = re.sub('[^\w\s-]', '-', urlOrPath).strip().lower()
  588. return re.sub('[-\s]+', '-', value).strip("-")
  589. def getRemoteFile(urlOrPath, destPath):
  590. '''
  591. Fetches URL to local path or just returns absolute path.
  592. :param urlOrPath: resource locator, generally URL or path
  593. :param destPath: path to store the resource, usually a path on file system
  594. :return: tuple having (path, 'local'/'remote')
  595. '''
  596. urlp = urlparse(urlOrPath)
  597. if urlp.scheme == '':
  598. return (os.path.abspath(urlOrPath), 'local')
  599. elif urlp.scheme not in ('http', 'https'):
  600. return (urlOrPath, 'local')
  601. else:
  602. filename = toFilename(urlOrPath)
  603. destPath = destPath + '/' +filename
  604. log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
  605. try:
  606. urlretrieve(urlOrPath, destPath)
  607. except IOError:
  608. # monkey patch fix for SSL/Windows per Tika-Python #54
  609. # https://github.com/chrismattmann/tika-python/issues/54
  610. import ssl
  611. if hasattr(ssl, '_create_unverified_context'):
  612. ssl._create_default_https_context = ssl._create_unverified_context
  613. # delete whatever we had there
  614. if os.path.exists(destPath) and os.path.isfile(destPath):
  615. os.remove(destPath)
  616. urlretrieve(urlOrPath, destPath)
  617. return (destPath, 'remote')
  618. def getRemoteJar(urlOrPath, destPath):
  619. '''
  620. Fetches URL to local path or just return absolute path.
  621. :param urlOrPath: remote resource locator
  622. :param destPath: Path to store the resource, usually a path on file system
  623. :return: tuple having (path, 'local'/'remote')
  624. '''
  625. urlp = urlparse(urlOrPath)
  626. if urlp.scheme == '':
  627. return (os.path.abspath(urlOrPath), 'local')
  628. else:
  629. log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
  630. try:
  631. urlretrieve(urlOrPath, destPath)
  632. except IOError:
  633. # monkey patch fix for SSL/Windows per Tika-Python #54
  634. # https://github.com/chrismattmann/tika-python/issues/54
  635. import ssl
  636. if hasattr(ssl, '_create_unverified_context'):
  637. ssl._create_default_https_context = ssl._create_unverified_context
  638. # delete whatever we had there
  639. if os.path.exists(destPath) and os.path.isfile(destPath):
  640. os.remove(destPath)
  641. urlretrieve(urlOrPath, destPath)
  642. return (destPath, 'remote')
  643. def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):
  644. '''
  645. Checks if the specified port is open
  646. :param remoteServerHost: the host address
  647. :param port: port which needs to be checked
  648. :return: ``True`` if port is open, ``False`` otherwise
  649. '''
  650. remoteServerIP = socket.gethostbyname(remoteServerHost)
  651. try:
  652. sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  653. result = sock.connect_ex((remoteServerIP, int(port)))
  654. if result == 0:
  655. return True
  656. else :
  657. return False
  658. sock.close()
  659. #FIXME: the above line is unreachable
  660. except KeyboardInterrupt:
  661. print("You pressed Ctrl+C")
  662. sys.exit()
  663. except socket.gaierror:
  664. print('Hostname could not be resolved. Exiting')
  665. sys.exit()
  666. except socket.error:
  667. print("Couldn't connect to server")
  668. sys.exit()
  669. def main(argv=None):
  670. """Run Tika from command line according to USAGE."""
  671. global Verbose
  672. global EncodeUtf8
  673. global csvOutput
  674. if argv is None:
  675. argv = sys.argv
  676. if (len(argv) < 3 and not (('-h' in argv) or ('--help' in argv))):
  677. log.exception('Bad args')
  678. raise TikaException('Bad args')
  679. try:
  680. opts, argv = getopt.getopt(argv[1:], 'hi:s:o:p:v:e:c',
  681. ['help', 'install=', 'server=', 'output=', 'port=', 'verbose', 'encode', 'csv'])
  682. except getopt.GetoptError as opt_error:
  683. msg, bad_opt = opt_error
  684. log.exception("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg))
  685. raise TikaException("%s error: Bad option: %s, %s" % (argv[0], bad_opt, msg))
  686. tikaServerJar = TikaServerJar
  687. serverHost = ServerHost
  688. outDir = '.'
  689. port = Port
  690. for opt, val in opts:
  691. if opt in ('-h', '--help'): echo2(USAGE); sys.exit()
  692. elif opt in ('--install'): tikaServerJar = val
  693. elif opt in ('--server'): serverHost = val
  694. elif opt in ('-o', '--output'): outDir = val
  695. elif opt in ('--port'): port = val
  696. elif opt in ('-v', '--verbose'): Verbose = 1
  697. elif opt in ('-e', '--encode'): EncodeUtf8 = 1
  698. elif opt in ('-c', '--csv'): csvOutput = 1
  699. else:
  700. raise TikaException(USAGE)
  701. cmd = argv[0]
  702. option = argv[1]
  703. try:
  704. paths = argv[2:]
  705. except:
  706. paths = None
  707. return runCommand(cmd, option, paths, port, outDir, serverHost=serverHost, tikaServerJar=tikaServerJar, verbose=Verbose, encode=EncodeUtf8)
  708. if __name__ == '__main__':
  709. log.info("Logging on '%s'" % (log_file))
  710. resp = main(sys.argv)
  711. # Set encoding of the terminal to UTF-8
  712. if sys.version.startswith("2"):
  713. # Python 2.x
  714. out = codecs.getwriter("UTF-8")(sys.stdout)
  715. elif sys.version.startswith("3"):
  716. # Python 3.x
  717. out = codecs.getwriter("UTF-8")(sys.stdout.buffer)
  718. if type(resp) == list:
  719. out.write('\n'.join([r[1] for r in resp]))
  720. else:
  721. out.write(resp)
  722. out.write('\n')