12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822 |
- #!/usr/bin/env python
- # encoding: utf-8
- # Licensed to the Apache Software Foundation (ASF) under one or more
- # contributor license agreements. See the NOTICE file distributed with
- # this work for additional information regarding copyright ownership.
- # The ASF licenses this file to You under the Apache License, Version 2.0
- # (the "License"); you may not use this file except in compliance with
- # the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- # Module documentation
- '''
- Tika Python module provides a Python API client for the Apache Tika Server.
- **Example usage**::
- import tika
- from tika import parser
- parsed = parser.from_file('/path/to/file')
- print(parsed["metadata"])
- print(parsed["content"])
- Visit https://github.com/chrismattmann/tika-python to learn more about it.
- **Detect IANA MIME Type**::
- from tika import detector
- print(detector.from_file('/path/to/file'))
- **Detect Language**::
- from tika import language
- print(language.from_file('/path/to/file'))
- **Use Tika Translate**::
- from tika import translate
- print(translate.from_file('/path/to/file', 'srcLang', 'destLang'))
- # Use auto Language detection feature
- print(translate.from_file('/path/to/file', 'destLang'))
- ***Tika-Python Configuration***
- You can now use custom configuration files. See https://tika.apache.org/1.18/configuring.html
- for details on writing configuration files. Configuration is set the first time the server is started.
- To use a configuration file with a parser, or detector:
- parsed = parser.from_file('/path/to/file', config_path='/path/to/configfile')
- or:
- detected = detector.from_file('/path/to/file', config_path='/path/to/configfile')
- or:
- detected = detector.from_buffer('some buffered content', config_path='/path/to/configfile')
- '''
- USAGE = """
- tika.py [-v] [-e] [-o <outputDir>] [--server <TikaServerEndpoint>] [--install <UrlToTikaServerJar>] [--port <portNumber>] <command> <option> <urlOrPathToFile>
- tika.py parse all test.pdf test2.pdf (write output JSON metadata files for test1.pdf_meta.json and test2.pdf_meta.json)
- tika.py detect type test.pdf (returns mime-type as text/plain)
- tika.py language file french.txt (returns language e.g., fr as text/plain)
- tika.py translate fr:en french.txt (translates the file french.txt from french to english)
- tika.py config mime-types (see what mime-types the Tika Server can handle)
- A simple python and command-line client for Tika using the standalone Tika server (JAR file).
- All commands return results in JSON format by default (except text in text/plain).
- To parse docs, use:
- tika.py parse <meta | text | all> <path>
- To check the configuration of the Tika server, use:
- tika.py config <mime-types | detectors | parsers>
- Commands:
- parse = parse the input file and write a JSON doc file.ext_meta.json containing the extracted metadata, text, or both
- detect type = parse the stream and 'detect' the MIME/media type, return in text/plain
- language file = parse the file stream and identify the language of the text, return its 2 character code in text/plain
- translate src:dest = parse and extract text and then translate the text from source language to destination language
- config = return a JSON doc describing the configuration of the Tika server (i.e. mime-types it
- can handle, or installed detectors or parsers)
- Arguments:
- urlOrPathToFile = file to be parsed, if URL it will first be retrieved and then passed to Tika
-
- Switches:
- --verbose, -v = verbose mode
- --encode, -e = encode response in UTF-8
- --csv, -c = report detect output in comma-delimited format
- --server <TikaServerEndpoint> = use a remote Tika Server at this endpoint, otherwise use local server
- --install <UrlToTikaServerJar> = download and exec Tika Server (JAR file), starting server on default port 9998
- Example usage as python client:
- -- from tika import runCommand, parse1
- -- jsonOutput = runCommand('parse', 'all', filename)
- or
- -- jsonOutput = parse1('all', filename)
- """
- import sys, os, getopt, time, codecs, re
# Text/bytes type aliases that resolve correctly on both Python 2 and Python 3.
try:
    # Python 2: ``unicode`` is the text type and ``str`` is the byte type.
    unicode_string, binary_string = unicode, str
except NameError:
    # Python 3: ``unicode`` no longer exists; ``str`` is text, ``bytes`` is binary.
    unicode_string, binary_string = str, bytes
- try:
- from urllib import urlretrieve
- except ImportError:
- from urllib.request import urlretrieve
- try:
- from urlparse import urlparse
- except ImportError:
- from urllib.parse import urlparse as urlparse
# Use rfc6266 to build the Content-Disposition header when it is installed;
# otherwise fall back to a plain ``attachment; filename=...`` value.
try:
    from rfc6266 import build_header
except ImportError:
    def make_content_disposition_header(fn):
        '''
        Build a Content-Disposition header value for the given file.
        :param fn: path of the file; only its base name is used
        :return: header value as a string
        '''
        base_name = os.path.basename(fn)
        return 'attachment; filename=%s' % base_name
else:
    def make_content_disposition_header(fn):
        '''
        Build a Content-Disposition header value for the given file using rfc6266.
        :param fn: path of the file; only its base name is used
        :return: header value as a string
        '''
        base_name = os.path.basename(fn)
        return build_header(base_name).decode('ascii')
# On Python 2 shadow the built-in open() with codecs.open so that callers can
# pass an encoding argument.
if sys.version_info < (3,):
    open = codecs.open
- import requests
- import socket
- import tempfile
- import hashlib
- import platform
- from subprocess import Popen
- from subprocess import STDOUT
- from os import walk
- import logging
# Logging setup: messages go both to <TIKA_LOG_PATH>/tika.log (the system
# temporary directory when TIKA_LOG_PATH is unset) and to the console.
log_path = os.getenv('TIKA_LOG_PATH', tempfile.gettempdir())
log_file = os.path.join(log_path, 'tika.log')
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
log = logging.getLogger('tika.tika')
# File handler first, then the console handler; both share one formatter.
fileHandler = logging.FileHandler(log_file)
consoleHandler = logging.StreamHandler()
for _handler in (fileHandler, consoleHandler):
    _handler.setFormatter(logFormatter)
    log.addHandler(_handler)
del _handler
# Log level
log.setLevel(logging.INFO)
# True when running on Windows.
Windows = platform.system() == "Windows"
# Tika version used to build the default server JAR download URL.
TikaVersion = os.getenv('TIKA_VERSION', '1.19')
# Directory where the downloaded tika-server.jar is stored.
TikaJarPath = os.getenv('TIKA_PATH', tempfile.gettempdir())
# Directory where remote documents are downloaded before being sent to Tika.
TikaFilesPath = tempfile.gettempdir()
# Directory where tika-server.log is written.
TikaServerLogFilePath = log_path
# URL (or path) of the Tika server JAR.
TikaServerJar = os.getenv(
    'TIKA_SERVER_JAR',
    "http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/"+TikaVersion+"/tika-server-"+TikaVersion+".jar")
ServerHost = "localhost"
Port = "9998"
ServerEndpoint = os.getenv(
    'TIKA_SERVER_ENDPOINT', 'http://' + ServerHost + ':' + Port)
# Fully qualified class name of the translator used by the translate service.
Translator = os.getenv(
    'TIKA_TRANSLATOR',
    "org.apache.tika.language.translate.Lingo24Translator")
# When true, never try to download or start a local Tika server.
# Environment variables are always strings, so explicit "off" spellings such as
# "0" or "false" must be recognised here; any other non-empty value enables it.
TikaClientOnly = os.getenv('TIKA_CLIENT_ONLY', '').strip().lower() not in ('', '0', 'false', 'no', 'off')
# Extra entries placed on the JVM classpath before the Tika server JAR.
TikaServerClasspath = os.getenv('TIKA_SERVER_CLASSPATH', '')
# Seconds to wait between checks of the server log during startup.
TikaStartupSleep = float(os.getenv('TIKA_STARTUP_SLEEP', 5))
# Maximum number of checks of the server log during startup.
TikaStartupMaxRetry = int(os.getenv('TIKA_STARTUP_MAX_RETRY', 3))
# Java executable used to launch the server.
TikaJava = os.getenv("TIKA_JAVA", "java")
# Command-line switches; main() overwrites these.
Verbose = 0
EncodeUtf8 = 0
csvOutput = 0
class TikaException(Exception):
    '''Error raised by this module for bad arguments or invalid options.'''
def echo2(*s):
    '''
    Write the arguments to stderr as one space-separated line prefixed with ``tika.py: ``.
    :param s: values to print; each one is converted to text first
    '''
    joined = unicode_string(' ').join(unicode_string(part) for part in s)
    sys.stderr.write(unicode_string('tika.py: %s\n') % joined)
def warn(*s):
    '''
    Write a warning line to stderr.
    :param s: values to print after the ``Warn:`` label
    '''
    echo2('Warn:', *s)
def die(*s):
    '''
    Report an error on stderr, print the usage text and terminate the process.
    :param s: values describing the error
    :raises SystemExit: always, with a non-zero exit status
    '''
    # Label the line as an error only (it used to read "Warn: Error: ...").
    echo2('Error:', *s)
    echo2(USAGE)
    # Exit with a failure status so that calling scripts can detect the error.
    sys.exit(1)
def runCommand(cmd, option, urlOrPaths, port, outDir=None,
               serverHost=ServerHost, tikaServerJar=TikaServerJar,
               verbose=Verbose, encode=EncodeUtf8):
    '''
    Run the Tika command by calling the Tika server and return results in JSON format (or plain text).
    :param cmd: a command from set ``{'parse', 'detect', 'language', 'translate', 'config'}``
    :param option: sub-option of the command (e.g. ``all`` for parse, ``type`` for detect)
    :param urlOrPaths: list of URLs or paths to process
    :param port: port of the Tika server
    :param outDir: directory for the output files of ``parse``
    :param serverHost: host name of the Tika server
    :param tikaServerJar: URL or path of the Tika server JAR
    :param verbose: passed through to the server call
    :param encode: accepted for backward compatibility; not used by this function
    :return: response for the command: a list of written file paths for ``parse``, a list of
        ``(status, response)`` tuples for ``detect``/``language``/``translate``, and the response
        body for ``config``
    :raises TikaException: if the command is unknown or no paths were given to parse/detect
    '''
    # Compare against the exact command names; ``cmd in 'parse'`` was a substring
    # test that also matched fragments such as 'p' or the empty string.
    if cmd in ('parse', 'detect') and (urlOrPaths is None or urlOrPaths == []):
        log.error('No URLs/paths specified.')
        raise TikaException('No URLs/paths specified.')
    # str() lets callers pass the port as an integer as well as a string.
    serverEndpoint = 'http://' + serverHost + ':' + str(port)
    if cmd == 'parse':
        return parseAndSave(option, urlOrPaths, outDir, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "detect":
        return detectType(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "language":
        return detectLang(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "translate":
        return doTranslate(option, urlOrPaths, serverEndpoint, verbose, tikaServerJar)
    elif cmd == "config":
        status, resp = getConfig(option, serverEndpoint, verbose, tikaServerJar)
        return resp
    else:
        log.error('Bad args')
        raise TikaException('Bad args')
def getPaths(urlOrPaths):
    '''
    Expand the given URLs/paths into a flat list. A directory is walked recursively and
    every file found in it is added; a file path or a URL is added unchanged.
    :param urlOrPaths: a single URL/path string, or an iterable of them
    :return: ``list`` of paths
    '''
    try:
        stringTypes = (basestring,)  # Python 2
    except NameError:
        stringTypes = (str, bytes)  # Python 3 has no ``basestring``
    if isinstance(urlOrPaths, stringTypes):
        # Wrap a single path so the loop below does not iterate over its characters.
        urlOrPaths = [urlOrPaths]
    paths = []
    for eachUrlOrPath in urlOrPaths:
        if os.path.isdir(eachUrlOrPath):
            for root, _directories, filenames in walk(eachUrlOrPath):
                paths.extend(os.path.join(root, filename) for filename in filenames)
        else:
            paths.append(eachUrlOrPath)
    return paths
def parseAndSave(option, urlOrPaths, outDir=None, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                 responseMimeType='application/json', metaExtension='_meta.json',
                 services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}):
    '''
    Parse the objects and write extracted metadata and/or text in JSON format to matching
    filename with an extension of '_meta.json'.
    :param option: key into ``services`` (``meta``, ``text`` or ``all``)
    :param urlOrPaths: URL/path or list of them; directories are expanded by ``getPaths``
    :param outDir: directory for the output files; when ``None`` each file is written next to its input
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param metaExtension: suffix appended to the input name to form the output name
    :param services: mapping of option name to server resource path
    :return: ``list`` of the output file paths that were written
    '''
    metaPaths = []
    for path in getPaths(urlOrPaths):
        if outDir is None:
            metaPath = path + metaExtension
        else:
            metaPath = os.path.join(outDir, os.path.split(path)[1] + metaExtension)
        # Parse first so that a failed parse does not leave an empty output file behind.
        content = parse1(option, path, serverEndpoint, verbose, tikaServerJar,
                         responseMimeType, services)[1]
        log.info('Writing %s', metaPath)
        # codecs.open takes the encoding as its third argument on Python 2 and 3;
        # the built-in open() of Python 3 would treat 'utf-8' as ``buffering`` and fail.
        with codecs.open(metaPath, 'w', 'utf-8') as f:
            f.write(content + u"\n")
        metaPaths.append(metaPath)
    return metaPaths
def parse(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
          responseMimeType='application/json',
          services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta'}, rawResponse=False):
    '''
    Parse the objects and return extracted metadata and/or text in JSON format.
    :param option: key into ``services`` (``meta``, ``text`` or ``all``)
    :param urlOrPaths: iterable of URLs or paths
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param rawResponse: when true, each response is the raw response content instead of decoded text
    :return: ``list`` of ``(status, response)`` tuples, one per input
    '''
    # rawResponse used to be accepted here but silently dropped; forward it.
    return [parse1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services,
                   rawResponse=rawResponse)
            for path in urlOrPaths]
def parse1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
           responseMimeType='application/json',
           services={'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}, rawResponse=False, headers=None, config_path=None):
    '''
    Parse the object and return extracted metadata and/or text in JSON format.
    :param option: key into ``services``; an unknown option falls back to ``all`` with a warning
    :param urlOrPath: URL or path of the document; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param rawResponse: when true, return the raw response content instead of decoded text
    :param headers: extra request headers; the mapping passed in is not modified
    :param config_path: path of a Tika configuration file, passed through to the server call
    :return: tuple of ``(status, response)``
    '''
    # Work on a copy so the caller's dictionary is not mutated.
    headers = dict(headers) if headers else {}
    path, file_type = getRemoteFile(urlOrPath, TikaFilesPath)
    headers.update({'Accept': responseMimeType, 'Content-Disposition': make_content_disposition_header(path)})
    if option not in services:
        log.warning('config option must be one of meta, text, or all; using all.')
    service = services.get(option, services['all'])
    # NOTE(review): the original reassigned responseMimeType to 'text/plain' for the
    # '/tika' service after the Accept header was already built, so it had no effect;
    # the header is deliberately left as before.
    try:
        # The context manager closes the document once the request has completed.
        with open(path, 'rb') as document:
            status, response = callServer('put', serverEndpoint, service, document,
                                          headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
    finally:
        # Remove the downloaded temporary copy even when the request fails.
        if file_type == 'remote':
            os.unlink(path)
    return (status, response)
def detectLang(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'file' : '/language/stream'}):
    '''
    Detect the language of each provided stream and return its 2 character code as text/plain.
    :param option: key into ``services``
    :param urlOrPaths: URL/path or list of them; directories are expanded by ``getPaths``
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: ``list`` with one ``detectLang1`` result per path
    '''
    results = []
    for path in getPaths(urlOrPaths):
        results.append(detectLang1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services))
    return results
def detectLang1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'file' : '/language/stream'}):
    '''
    Detect the language of the provided stream and return its 2 character code as text/plain.
    :param option: key into ``services``
    :param urlOrPath: URL or path of the document; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: tuple of ``(status, response)``
    :raises TikaException: if ``option`` is not a key of ``services``
    '''
    # Validate before downloading anything. list() keeps the message identical on
    # Python 2 and avoids the TypeError that bytes(dict_keys) raises on Python 3.
    if option not in services:
        message = 'Language option must be one of %s ' % (list(services.keys()),)
        log.error(message)
        raise TikaException(message)
    service = services[option]
    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # The context manager closes the document once the request has completed.
        with open(path, 'rb') as document:
            status, response = callServer('put', serverEndpoint, service, document,
                                          {'Accept': responseMimeType}, verbose, tikaServerJar)
    finally:
        # Remove the downloaded temporary copy, as parse1 does.
        if mode == 'remote':
            os.unlink(path)
    return (status, response)
def doTranslate(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'all': '/translate/all'}):
    '''
    Translate each file from source language to destination language.
    :param option: ``srcLang:destLang`` or just ``destLang``
    :param urlOrPaths: URL/path or list of them; directories are expanded by ``getPaths``
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: ``list`` with one ``doTranslate1`` result per path
    '''
    translated = []
    for path in getPaths(urlOrPaths):
        translated.append(doTranslate1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services))
    return translated
-
def doTranslate1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                 responseMimeType='text/plain',
                 services={'all': '/translate/all'}):
    '''
    Translate one file from source language to destination language.
    :param option: ``srcLang:destLang``, or just ``destLang`` to omit the source language
    :param urlOrPath: URL or path of the document; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path; the ``all`` entry is used
    :return: tuple of ``(status, response)``
    :raises TikaException: if ``option`` contains more than one ``:``
    '''
    # Validate the option before downloading anything.
    srcLang = ""
    if ":" in option:
        options = option.split(':')
        if len(options) != 2:
            log.error('Translate options are specified as srcLang:destLang or as destLang')
            raise TikaException('Translate options are specified as srcLang:destLang or as destLang')
        srcLang, destLang = options
    else:
        destLang = option

    if srcLang != "" and destLang != "":
        service = services["all"] + "/" + Translator + "/" + srcLang + "/" + destLang
    else:
        service = services["all"] + "/" + Translator + "/" + destLang

    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # The context manager closes the document once the request has completed.
        with open(path, 'rb') as document:
            status, response = callServer('put', serverEndpoint, service, document,
                                          {'Accept' : responseMimeType},
                                          verbose, tikaServerJar)
    finally:
        # Remove the downloaded temporary copy, as parse1 does.
        if mode == 'remote':
            os.unlink(path)
    return (status, response)
-
def detectType(option, urlOrPaths, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
               responseMimeType='text/plain',
               services={'type': '/detect/stream'}):
    '''
    Detect the MIME/media type of each stream and return it in text/plain.
    :param option: key into ``services``
    :param urlOrPaths: URL/path or list of them; directories are expanded by ``getPaths``
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: ``list`` with one ``detectType1`` result per path
    '''
    detected = []
    for path in getPaths(urlOrPaths):
        detected.append(detectType1(option, path, serverEndpoint, verbose, tikaServerJar, responseMimeType, services))
    return detected
def detectType1(option, urlOrPath, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar,
                responseMimeType='text/plain',
                services={'type': '/detect/stream'}, config_path=None):
    '''
    Detect the MIME/media type of the stream and return it in text/plain.
    :param option: key into ``services``
    :param urlOrPath: URL or path of the document; http(s) URLs are downloaded first
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :param config_path: path of a Tika configuration file, passed through to the server call
    :return: tuple of ``(status, response)``; when the module-level ``csvOutput`` is 1 the
        response is prefixed with the input name and a comma
    :raises TikaException: if ``option`` is not a key of ``services``
    '''
    # Validate before downloading anything. list() keeps the message identical on
    # Python 2 and avoids the TypeError that bytes(dict_keys) raises on Python 3.
    if option not in services:
        message = 'Detect option must be one of %s' % (list(services.keys()),)
        log.error(message)
        raise TikaException(message)
    service = services[option]
    path, mode = getRemoteFile(urlOrPath, TikaFilesPath)
    try:
        # The context manager closes the document once the request has completed.
        with open(path, 'rb') as document:
            status, response = callServer('put', serverEndpoint, service, document,
                                          {
                                              'Accept': responseMimeType,
                                              'Content-Disposition': make_content_disposition_header(path)
                                          },
                                          verbose, tikaServerJar, config_path=config_path)
    finally:
        # Remove the downloaded temporary copy, as parse1 does.
        if mode == 'remote':
            os.unlink(path)
    if csvOutput == 1:
        # Only byte strings need decoding; Python 3 text has no decode() method.
        name = urlOrPath.decode("UTF-8") if isinstance(urlOrPath, bytes) else urlOrPath
        return (status, name + "," + response)
    else:
        return (status, response)
def getConfig(option, serverEndpoint=ServerEndpoint, verbose=Verbose, tikaServerJar=TikaServerJar, responseMimeType='application/json',
              services={'mime-types': '/mime-types', 'detectors': '/detectors', 'parsers': '/parsers/details'}):
    '''
    Query the Tika Server for its configuration (parsers, detectors, etc.) and return it in JSON format.
    :param option: key into ``services`` (``mime-types``, ``detectors`` or ``parsers``)
    :param serverEndpoint: Tika server endpoint
    :param verbose: passed through to the server call
    :param tikaServerJar: URL or path of the Tika server JAR
    :param responseMimeType: value of the ``Accept`` header
    :param services: mapping of option name to server resource path
    :return: tuple of ``(status, response)``
    '''
    if option not in services:
        # die() prints the usage text and exits the process.
        die('config option must be one of mime-types, detectors, or parsers')
    acceptHeader = {'Accept': responseMimeType}
    resource = services[option]
    status, response = callServer('get', serverEndpoint, resource, None, acceptHeader, verbose, tikaServerJar)
    return (status, response)
def callServer(verb, serverEndpoint, service, data, headers, verbose=Verbose, tikaServerJar=TikaServerJar,
               httpVerbs={'get': requests.get, 'put': requests.put, 'post': requests.post}, classpath=None,
               rawResponse=False,config_path=None):
    '''
    Call the Tika Server, do some error checking, and return the response.
    :param verb: HTTP verb; must be a key of ``httpVerbs``
    :param serverEndpoint: Tika server endpoint
    :param service: resource path appended to the endpoint
    :param data: request body: a file-like object, text, bytes or ``None``
    :param headers: request headers
    :param verbose: when true, write request and response headers to stderr
    :param tikaServerJar: URL or path of the Tika server JAR
    :param httpVerbs: mapping of verb name to the function that performs the request
    :param classpath: JVM classpath for a locally started server; defaults to ``TikaServerClasspath``
    :param rawResponse: when true, return the raw response content instead of decoded text
    :param config_path: path of a Tika configuration file for a locally started server
    :return: tuple of ``(status code, response body)``
    :raises TikaException: if ``verb`` is not a key of ``httpVerbs``
    '''
    # Reject an unknown verb before any server is checked or started.
    if verb not in httpVerbs:
        # list() avoids the TypeError that bytes(dict_keys) raises on Python 3.
        message = 'Tika Server call must be one of %s' % (list(httpVerbs.keys()),)
        log.error(message)
        raise TikaException(message)
    verbFn = httpVerbs[verb]

    parsedUrl = urlparse(serverEndpoint)
    if classpath is None:
        classpath = TikaServerClasspath

    # Unless running client-only, make sure a local server is up.
    if not TikaClientOnly:
        serverEndpoint = checkTikaServer(parsedUrl.scheme, parsedUrl.hostname, parsedUrl.port,
                                         tikaServerJar, classpath, config_path)
    serviceUrl = serverEndpoint + service

    # On Windows the stream is read fully and sent as an in-memory body.
    if Windows and hasattr(data, "read"):
        data = data.read()

    encodedData = data.encode('utf-8') if isinstance(data, unicode_string) else data
    # NOTE(security): verify=False disables TLS certificate verification for https endpoints.
    resp = verbFn(serviceUrl, encodedData, headers=headers, verify=False)
    if verbose:
        # print(sys.stderr, ...) printed the stream object to stdout; write to stderr instead.
        sys.stderr.write("Request headers: %s\n" % (headers,))
        sys.stderr.write("Response headers: %s\n" % (resp.headers,))
    if resp.status_code != 200:
        log.warning('Tika server returned status: %d', resp.status_code)
    resp.encoding = "utf-8"
    if rawResponse:
        return (resp.status_code, resp.content)
    else:
        return (resp.status_code, resp.text)
def checkTikaServer(scheme="http", serverHost=ServerHost, port=Port, tikaServerJar=TikaServerJar, classpath=None, config_path=None):
    '''
    Check that tika-server is running. If not, download JAR file and start it up.
    Only endpoints containing ``localhost`` or ``127.0.0.1`` are checked and started.
    :param scheme: e.g. http or https
    :param serverHost: host name of the server
    :param port: port of the server; ``None`` when the endpoint URL carried no port
    :param tikaServerJar: URL or path of the Tika server JAR
    :param classpath: JVM classpath for the server; defaults to ``TikaServerClasspath``
    :param config_path: path of a Tika configuration file for the server
    :return: the server endpoint URL
    :raises RuntimeError: if the local server could not be started
    '''
    if classpath is None:
        classpath = TikaServerClasspath
    urlp = urlparse(tikaServerJar)
    hostEndpoint = '%s://%s' % (scheme, serverHost)
    isLocal = 'localhost' in hostEndpoint or '127.0.0.1' in hostEndpoint
    if port is None and isLocal:
        # A local server is always started on an explicit port; use the default.
        port = Port
    if port is None:
        # urlparse() yields no port for URLs such as http://host; do not emit "host:None".
        serverEndpoint = hostEndpoint
    else:
        serverEndpoint = '%s:%s' % (hostEndpoint, port)
    jarPath = os.path.join(TikaJarPath, 'tika-server.jar')
    if 'localhost' in serverEndpoint or '127.0.0.1' in serverEndpoint:
        alreadyRunning = checkPortIsOpen(serverHost, port)
        if not alreadyRunning:
            if not os.path.isfile(jarPath) and urlp.scheme != '':
                getRemoteJar(tikaServerJar, jarPath)
            if not checkJarSig(tikaServerJar, jarPath):
                # Checksum mismatch: discard the JAR and download it again.
                os.remove(jarPath)
                getRemoteJar(tikaServerJar, jarPath)
            status = startServer(jarPath, TikaJava, serverHost, port, classpath, config_path)
            if not status:
                log.error("Failed to receive startup confirmation from startServer.")
                raise RuntimeError("Unable to start Tika server.")
    return serverEndpoint
def checkJarSig(tikaServerJar, jarPath):
    '''
    Checks the MD5 checksum of the JAR against the ``.md5`` file stored next to it.
    The ``.md5`` file is downloaded first when it is missing.
    :param tikaServerJar: URL of the JAR; ``.md5`` is appended to locate the checksum
    :param jarPath: local path of the JAR
    :return: ``True`` if the signature of the jar matches
    '''
    md5Path = jarPath + ".md5"
    if not os.path.isfile(md5Path):
        getRemoteJar(tikaServerJar + ".md5", md5Path)
    digest = hashlib.md5()
    with open(jarPath, 'rb') as f:
        # Hash in chunks so the whole JAR is never held in memory at once.
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    with open(md5Path, "r") as em:
        tokens = em.read().split()
    # Tolerate surrounding whitespace, upper-case digests and the "<hash>  <file>" layout.
    expected = tokens[0].lower() if tokens else ''
    return expected == digest.hexdigest()
def startServer(tikaServerJar, java_path = TikaJava, serverHost = ServerHost, port = Port, classpath=None, config_path=None):
    '''
    Starts Tika Server
    :param tikaServerJar: path to tika server jar
    :param java_path: Java executable used to launch the server
    :param serverHost: accepted for compatibility; the server is bound to ``localhost``
        (``0.0.0.0`` on Windows) regardless of this value
    :param port: the host port to be used for binding the service
    :param classpath: Class path value to pass to JVM
    :param config_path: optional Tika configuration file passed with ``--config``
    :return: ``True`` if the startup message was seen in the server log, ``False`` otherwise
    '''
    import errno

    if classpath is None:
        classpath = TikaServerClasspath
    host = "localhost"
    if Windows:
        host = "0.0.0.0"
    if classpath:
        # os.pathsep is ':' on POSIX and ';' on Windows, matching the JVM's expectation.
        classpath += os.pathsep + tikaServerJar
    else:
        classpath = tikaServerJar
    # setup command string
    cmd_string = '%s -cp %s org.apache.tika.server.TikaServerCli --port %s --host %s' \
                 % (java_path, classpath, port, host)
    if config_path:
        cmd_string += ' --config %s' % config_path
    cmd_string += ' &'
    # Check that we can write to log path
    tika_log_file_path = os.path.join(TikaServerLogFilePath, 'tika-server.log')
    try:
        logFile = open(tika_log_file_path, 'w')
    except (IOError, OSError) as e:
        # Equivalent of PermissionError, which does not exist on Python 2.
        if e.errno not in (errno.EACCES, errno.EPERM):
            raise
        log.error("Unable to create tika-server.log at %s due to permission error." % (TikaServerLogFilePath))
        return False
    try:
        # Check that specified java binary is available on path
        try:
            with open(os.devnull, "w") as devnull:
                probe = Popen(java_path, stdout=devnull, stderr=devnull)
                # Reap the probe so it does not linger as a zombie process.
                probe.wait()
        except OSError as e:
            # Equivalent of FileNotFoundError, which does not exist on Python 2.
            if e.errno != errno.ENOENT:
                raise
            log.error("Unable to run java; is it installed?")
            return False
        # Run java with jar args
        # NOTE(security): the command is a shell string; paths containing spaces or shell
        # metacharacters are not quoted.
        Popen(cmd_string, stdout=logFile, stderr=STDOUT, shell=True)
    finally:
        # The child process holds its own copy of the descriptor.
        logFile.close()
    # Check logs and retry as configured
    try_count = 0
    is_started = False
    while try_count < TikaStartupMaxRetry:
        with open(tika_log_file_path, "r") as tika_log_file_tmp:
            # check for INFO string to confirm listening endpoint
            if "Started Apache Tika server at" in tika_log_file_tmp.read():
                is_started = True
        if is_started:
            # Stop polling (and sleeping) as soon as the server reports it is up.
            break
        log.warning("Failed to see startup log message; retrying...")
        time.sleep(TikaStartupSleep)
        try_count += 1
    if not is_started:
        log.error("Tika startup log message not received after %d tries." % (TikaStartupMaxRetry))
    return is_started
def toFilename(urlOrPath):
    '''
    Turn a URL or path into a string usable as a file name.
    :param urlOrPath: URL or path to convert
    :return: lower-cased text in which every run of characters other than word characters
        is collapsed into a single ``-``, with leading and trailing ``-`` removed
    '''
    # Raw strings: '\w' and '\s' in a normal literal are invalid escape sequences.
    value = re.sub(r'[^\w\s-]', '-', urlOrPath).strip().lower()
    return re.sub(r'[-\s]+', '-', value).strip("-")
-
def getRemoteFile(urlOrPath, destPath):
    '''
    Download an http(s) URL into a local directory, or hand back a local path.
    :param urlOrPath: resource locator, generally URL or path
    :param destPath: directory in which a downloaded resource is stored
    :return: tuple having (path, 'local'/'remote')
    '''
    scheme = urlparse(urlOrPath).scheme
    if scheme == '':
        # Plain path: resolve it against the current directory.
        return (os.path.abspath(urlOrPath), 'local')
    if scheme not in ('http', 'https'):
        # Any other scheme is returned unchanged and treated as local.
        return (urlOrPath, 'local')

    destPath = destPath + '/' + toFilename(urlOrPath)
    log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
    try:
        urlretrieve(urlOrPath, destPath)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # delete whatever we had there
        if os.path.exists(destPath) and os.path.isfile(destPath):
            os.remove(destPath)
        # Second and final attempt.
        urlretrieve(urlOrPath, destPath)
    return (destPath, 'remote')
def getRemoteJar(urlOrPath, destPath):
    '''
    Download a URL to the given local path, or hand back the absolute form of a local path.
    :param urlOrPath: remote resource locator
    :param destPath: Path to store the resource, usually a path on file system
    :return: tuple having (path, 'local'/'remote')
    '''
    if urlparse(urlOrPath).scheme == '':
        # No scheme: nothing to download.
        return (os.path.abspath(urlOrPath), 'local')

    log.info('Retrieving %s to %s.' % (urlOrPath, destPath))
    try:
        urlretrieve(urlOrPath, destPath)
    except IOError:
        # monkey patch fix for SSL/Windows per Tika-Python #54
        # https://github.com/chrismattmann/tika-python/issues/54
        import ssl
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        # delete whatever we had there
        if os.path.exists(destPath) and os.path.isfile(destPath):
            os.remove(destPath)
        # Second and final attempt.
        urlretrieve(urlOrPath, destPath)
    return (destPath, 'remote')
-
def checkPortIsOpen(remoteServerHost=ServerHost, port = Port):
    '''
    Checks if the specified port is open
    :param remoteServerHost: the host address
    :param port: port which needs to be checked
    :return: ``True`` if port is open, ``False`` otherwise
    '''
    # NOTE(review): host resolution happens outside the try block, so a resolution
    # failure propagates to the caller instead of reaching the gaierror handler below.
    remoteServerIP = socket.gethostbyname(remoteServerHost)
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            # connect_ex returns 0 on success instead of raising.
            return sock.connect_ex((remoteServerIP, int(port))) == 0
        finally:
            # Always release the socket; the original close() call was unreachable.
            sock.close()
    except KeyboardInterrupt:
        print("You pressed Ctrl+C")
        sys.exit()
    except socket.gaierror:
        print('Hostname could not be resolved. Exiting')
        sys.exit()
    except socket.error:
        print("Couldn't connect to server")
        sys.exit()
def main(argv=None):
    '''
    Run Tika from command line according to USAGE.
    :param argv: full argument vector including the program name; defaults to ``sys.argv``
    :return: the result of ``runCommand`` for the parsed command
    :raises TikaException: on missing arguments or an unknown option
    '''
    global Verbose
    global EncodeUtf8
    global csvOutput
    if argv is None:
        argv = sys.argv
    if (len(argv) < 3 and not (('-h' in argv) or ('--help' in argv))):
        log.error('Bad args')
        raise TikaException('Bad args')
    progName = argv[0]
    try:
        # -v, -e and -c are plain flags; a trailing ':' would make them swallow the next argument.
        opts, args = getopt.getopt(argv[1:], 'hi:s:o:p:vec',
                                   ['help', 'install=', 'server=', 'output=', 'port=', 'verbose', 'encode', 'csv'])
    except getopt.GetoptError as opt_error:
        # GetoptError exposes .msg and .opt; it cannot be unpacked like a tuple on Python 3.
        message = "%s error: Bad option: %s, %s" % (progName, opt_error.opt, opt_error.msg)
        log.error(message)
        raise TikaException(message)
    tikaServerJar = TikaServerJar
    serverHost = ServerHost
    outDir = '.'
    port = Port
    for opt, val in opts:
        if opt in ('-h', '--help'):
            echo2(USAGE)
            sys.exit()
        elif opt in ('-i', '--install'):
            tikaServerJar = val
        elif opt in ('-s', '--server'):
            serverHost = val
        elif opt in ('-o', '--output'):
            outDir = val
        elif opt in ('-p', '--port'):
            port = val
        elif opt in ('-v', '--verbose'):
            Verbose = 1
        elif opt in ('-e', '--encode'):
            EncodeUtf8 = 1
        elif opt in ('-c', '--csv'):
            csvOutput = 1
        else:
            raise TikaException(USAGE)
    # Both a command and its option are required after the switches.
    if len(args) < 2:
        log.error('Bad args')
        raise TikaException('Bad args')
    cmd = args[0]
    option = args[1]
    paths = args[2:]
    return runCommand(cmd, option, paths, port, outDir, serverHost=serverHost, tikaServerJar=tikaServerJar, verbose=Verbose, encode=EncodeUtf8)
if __name__ == '__main__':
    log.info("Logging on '%s'" % (log_file))
    resp = main(sys.argv)
    # Set encoding of the terminal to UTF-8. Python 3 exposes the byte stream as
    # sys.stdout.buffer; Python 2's sys.stdout already accepts bytes.
    out = codecs.getwriter("UTF-8")(getattr(sys.stdout, 'buffer', sys.stdout))
    if isinstance(resp, list):
        # Items are (status, response) tuples, except for 'parse', which returns the
        # list of written file paths; indexing a path with [1] would print one character.
        out.write('\n'.join([r[1] if isinstance(r, tuple) else r for r in resp]))
    else:
        out.write(resp)
    out.write('\n')
|