12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594 |
- # Natural Language Toolkit: Corpus & Model Downloader
- #
- # Copyright (C) 2001-2019 NLTK Project
- # Author: Edward Loper <edloper@gmail.com>
- # URL: <http://nltk.org/>
- # For license information, see LICENSE.TXT
- """
- The NLTK corpus and module downloader. This module defines several
- interfaces which can be used to download corpora, models, and other
- data packages that can be used with NLTK.
- Downloading Packages
- ====================
- If called with no arguments, ``download()`` will display an interactive
- interface which can be used to download and install new packages.
- If Tkinter is available, then a graphical interface will be shown,
- otherwise a simple text interface will be provided.
- Individual packages can be downloaded by calling the ``download()``
- function with a single argument, giving the package identifier for the
- package that should be downloaded:
- >>> download('treebank') # doctest: +SKIP
- [nltk_data] Downloading package 'treebank'...
- [nltk_data] Unzipping corpora/treebank.zip.
- NLTK also provides a number of \"package collections\", consisting of
- a group of related packages. To download all packages in a
- collection, simply call ``download()`` with the collection's
- identifier:
- >>> download('all-corpora') # doctest: +SKIP
- [nltk_data] Downloading package 'abc'...
- [nltk_data] Unzipping corpora/abc.zip.
- [nltk_data] Downloading package 'alpino'...
- [nltk_data] Unzipping corpora/alpino.zip.
- ...
- [nltk_data] Downloading package 'words'...
- [nltk_data] Unzipping corpora/words.zip.
- Download Directory
- ==================
- By default, packages are installed in either a system-wide directory
- (if Python has sufficient access to write to it); or in the current
- user's home directory. However, the ``download_dir`` argument may be
- used to specify a different installation target, if desired.
- See ``Downloader.default_download_dir()`` for a more detailed
- description of how the default download directory is chosen.
- NLTK Download Server
- ====================
- Before downloading any packages, the corpus and module downloader
- contacts the NLTK download server, to retrieve an index file
- describing the available packages. By default, this index file is
- loaded from ``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml``.
- If necessary, it is possible to create a new ``Downloader`` object,
- specifying a different URL for the package index file.
- Usage::
- python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
- or::
- python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
- """
- # ----------------------------------------------------------------------
- from __future__ import print_function, division, unicode_literals
- """
- 0 1 2 3
- [label][----][label][----]
- [column ][column ]
- Notes
- =====
- Handling data files.. Some questions:
- * Should the data files be kept zipped or unzipped? I say zipped.
- * Should the data files be kept in svn at all? Advantages: history;
- automatic version numbers; 'svn up' could be used rather than the
- downloader to update the corpora. Disadvantages: they're big,
- which makes working from svn a bit of a pain. And we're planning
- to potentially make them much bigger. I don't think we want
- people to have to download 400MB corpora just to use nltk from svn.
- * Compromise: keep the data files in trunk/data rather than in
- trunk/nltk. That way you can check them out in svn if you want
- to; but you don't need to, and you can use the downloader instead.
- * Also: keep models in mind. When we change the code, we'd
- potentially like the models to get updated. This could require a
- little thought.
- * So.. let's assume we have a trunk/data directory, containing a bunch
- of packages. The packages should be kept as zip files, because we
- really shouldn't be editing them much (well -- we may edit models
- more, but they tend to be binary-ish files anyway, where diffs
- aren't that helpful). So we'll have trunk/data, with a bunch of
- files like abc.zip and treebank.zip and propbank.zip. For each
- package we could also have eg treebank.xml and propbank.xml,
- describing the contents of the package (name, copyright, license,
- etc). Collections would also have .xml files. Finally, we would
- pull all these together to form a single index.xml file. Some
- directory structure wouldn't hurt. So how about::
- /trunk/data/ ....................... root of data svn
- index.xml ........................ main index file
- src/ ............................. python scripts
- packages/ ........................ dir for packages
- corpora/ ....................... zip & xml files for corpora
- grammars/ ...................... zip & xml files for grammars
- taggers/ ....................... zip & xml files for taggers
- tokenizers/ .................... zip & xml files for tokenizers
- etc.
- collections/ ..................... xml files for collections
- Where the root (/trunk/data) would contain a makefile; and src/
- would contain a script to update the info.xml file. It could also
- contain scripts to rebuild some of the various model files. The
- script that builds index.xml should probably check that each zip
- file expands entirely into a single subdir, whose name matches the
- package's uid.
- Changes I need to make:
- - in index: change "size" to "filesize" or "compressed-size"
- - in index: add "unzipped-size"
- - when checking status: check both compressed & uncompressed size.
- uncompressed size is important to make sure we detect a problem
- if something got partially unzipped. define new status values
- to differentiate stale vs corrupt vs corruptly-uncompressed??
- (we shouldn't need to re-download the file if the zip file is ok
- but it didn't get uncompressed fully.)
- - add other fields to the index: author, license, copyright, contact,
- etc.
- the current grammars/ package would become a single new package (eg
- toy-grammars or book-grammars).
- xml file should have:
- - authorship info
- - license info
- - copyright info
- - contact info
- - info about what type of data/annotation it contains?
- - recommended corpus reader?
- collections can contain other collections. they can also contain
- multiple package types (corpora & models). Have a single 'basics'
- package that includes everything we talk about in the book?
- n.b.: there will have to be a fallback to the punkt tokenizer, in case
- they didn't download that model.
- default: unzip or not?
- """
- import time, os, zipfile, sys, textwrap, threading, itertools, shutil, functools
- import subprocess
- from hashlib import md5
- from xml.etree import ElementTree
- try:
- TKINTER = True
- from six.moves.tkinter import (
- Tk,
- Frame,
- Label,
- Entry,
- Button,
- Canvas,
- Menu,
- IntVar,
- TclError,
- )
- from six.moves.tkinter_messagebox import showerror
- from nltk.draw.table import Table
- from nltk.draw.util import ShowText
- except ImportError:
- TKINTER = False
- TclError = ValueError
- from six import string_types, text_type
- from six.moves import input
- from six.moves.urllib.request import urlopen
- from six.moves.urllib.error import HTTPError, URLError
- import nltk
- from nltk.compat import python_2_unicode_compatible
- # urllib2 = nltk.internals.import_from_stdlib('urllib2')
- ######################################################################
- # Directory entry objects (from the data server's index file)
- ######################################################################
@python_2_unicode_compatible
class Package(object):
    """
    A directory entry for a downloadable package.  These entries are
    extracted from the XML index file that is downloaded by
    ``Downloader``.  Each package consists of a single file; but if
    that file is a zip file, then it can be automatically decompressed
    when the package is installed.

    Note that ``size`` and ``unzipped_size`` are passed through
    ``int()``, so although they default to None in the signature they
    must be supplied with int-convertible values (``int(None)`` raises
    ``TypeError``).
    """

    def __init__(
        self,
        id,
        url,
        name=None,
        subdir='',
        size=None,
        unzipped_size=None,
        checksum=None,
        svn_revision=None,
        copyright='Unknown',
        contact='Unknown',
        license='Unknown',
        author='Unknown',
        unzip=True,
        **kw
    ):
        self.id = id
        """A unique identifier for this package."""

        self.name = name or id
        """A string name for this package."""

        self.subdir = subdir
        """The subdirectory where this package should be installed.
        E.g., ``'corpora'`` or ``'taggers'``."""

        self.url = url
        """A URL that can be used to download this package's file."""

        self.size = int(size)
        """The filesize (in bytes) of the package file."""

        self.unzipped_size = int(unzipped_size)
        """The total filesize of the files contained in the package's
        zipfile."""

        self.checksum = checksum
        """The MD-5 checksum of the package file."""

        self.svn_revision = svn_revision
        """A subversion revision number for this package."""

        self.copyright = copyright
        """Copyright holder for this package."""

        self.contact = contact
        """Name & email of the person who should be contacted with
        questions about this package."""

        self.license = license
        """License information for this package."""

        self.author = author
        """Author of this package."""

        ext = os.path.splitext(url.split('/')[-1])[1]
        self.filename = os.path.join(subdir, id + ext)
        """The filename that should be used for this package's file.  It
        is formed by joining ``self.subdir`` with ``self.id``, and
        using the same extension as ``url``."""

        self.unzip = bool(int(unzip))  # '0' or '1'
        """A flag indicating whether this corpus should be unzipped by
        default."""

        # Include any other attributes provided by the XML file.
        self.__dict__.update(kw)

    @staticmethod
    def fromxml(xml):
        """
        Build a ``Package`` from an XML element whose attributes are the
        constructor's keyword arguments.

        :param xml: Either an element object exposing ``attrib``, or a
            string naming an XML file; in the latter case the file is
            parsed and its root element is used.
        :return: A new ``Package``.
        """
        if isinstance(xml, string_types):
            # ``ElementTree.parse`` returns an ``ElementTree`` wrapper that
            # has no ``attrib``; the attributes live on its root element.
            xml = ElementTree.parse(xml).getroot()
        for key in xml.attrib:
            xml.attrib[key] = text_type(xml.attrib[key])
        return Package(**xml.attrib)

    def __lt__(self, other):
        """Order packages by their identifier."""
        return self.id < other.id

    def __repr__(self):
        return '<Package %s>' % self.id
@python_2_unicode_compatible
class Collection(object):
    """
    A directory entry for a collection of downloadable packages.
    These entries are extracted from the XML index file that is
    downloaded by ``Downloader``.
    """

    def __init__(self, id, children, name=None, **kw):
        self.id = id
        """A unique identifier for this collection."""

        self.name = name or id
        """A string name for this collection."""

        self.children = children
        """A list of the ``Collections`` or ``Packages`` directly
        contained by this collection."""

        self.packages = None
        """A list of ``Packages`` contained by this collection or any
        collections it recursively contains."""

        # Include any other attributes provided by the XML file.
        self.__dict__.update(kw)

    @staticmethod
    def fromxml(xml):
        """
        Build a ``Collection`` from an XML element.  The element's
        attributes become constructor keyword arguments, and the ``ref``
        attribute of each ``item`` sub-element becomes a child entry.

        :param xml: Either an element object, or a string naming an XML
            file; in the latter case the file is parsed and its root
            element is used.
        :return: A new ``Collection`` whose ``children`` are the ``ref``
            values found on the ``item`` sub-elements.
        """
        if isinstance(xml, string_types):
            # ``ElementTree.parse`` returns an ``ElementTree`` wrapper that
            # has no ``attrib``; work on its root element instead.
            xml = ElementTree.parse(xml).getroot()
        for key in xml.attrib:
            xml.attrib[key] = text_type(xml.attrib[key])
        children = [child.get('ref') for child in xml.findall('item')]
        return Collection(children=children, **xml.attrib)

    def __lt__(self, other):
        """Order collections by their identifier."""
        return self.id < other.id

    def __repr__(self):
        return '<Collection %s>' % self.id
- ######################################################################
- # Message Passing Objects
- ######################################################################
class DownloaderMessage(object):
    """Base class for the status messages that ``incr_download``
    yields to report its progress."""


class StartCollectionMessage(DownloaderMessage):
    """Work on a collection of packages has begun."""

    def __init__(self, collection):
        self.collection = collection


class FinishCollectionMessage(DownloaderMessage):
    """Work on a collection of packages has ended."""

    def __init__(self, collection):
        self.collection = collection


class StartPackageMessage(DownloaderMessage):
    """Work on a single package has begun."""

    def __init__(self, package):
        self.package = package


class FinishPackageMessage(DownloaderMessage):
    """Work on a single package has ended."""

    def __init__(self, package):
        self.package = package


class StartDownloadMessage(DownloaderMessage):
    """The download of a package's file has begun."""

    def __init__(self, package):
        self.package = package


class FinishDownloadMessage(DownloaderMessage):
    """The download of a package's file has ended."""

    def __init__(self, package):
        self.package = package


class StartUnzipMessage(DownloaderMessage):
    """Unzipping of a package has begun."""

    def __init__(self, package):
        self.package = package


class FinishUnzipMessage(DownloaderMessage):
    """Unzipping of a package has ended."""

    def __init__(self, package):
        self.package = package


class UpToDateMessage(DownloaderMessage):
    """The package's downloaded file is already up-to-date."""

    def __init__(self, package):
        self.package = package


class StaleMessage(DownloaderMessage):
    """The package's downloaded file is out-of-date or corrupt."""

    def __init__(self, package):
        self.package = package


class ErrorMessage(DownloaderMessage):
    """An error was encountered.  ``message`` is stored as text: an
    exception instance is converted with ``str()``, anything else is
    kept unchanged."""

    def __init__(self, package, message):
        self.package = package
        self.message = str(message) if isinstance(message, Exception) else message


class ProgressMessage(DownloaderMessage):
    """Reports how much progress has been made so far."""

    def __init__(self, progress):
        self.progress = progress


class SelectDownloadDirMessage(DownloaderMessage):
    """Reports which download directory is being used."""

    def __init__(self, download_dir):
        self.download_dir = download_dir
- ######################################################################
- # NLTK Data Server
- ######################################################################
- class Downloader(object):
- """
- A class used to access the NLTK data server, which can be used to
- download corpora and other data packages.
- """
- # /////////////////////////////////////////////////////////////////
- # Configuration
- # /////////////////////////////////////////////////////////////////
- INDEX_TIMEOUT = 60 * 60 # 1 hour
- """The amount of time after which the cached copy of the data
- server index will be considered 'stale,' and will be
- re-downloaded."""
- DEFAULT_URL = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml'
- """The default URL for the NLTK data server's index. An
- alternative URL can be specified when creating a new
- ``Downloader`` object."""
- # /////////////////////////////////////////////////////////////////
- # Status Constants
- # /////////////////////////////////////////////////////////////////
- INSTALLED = 'installed'
- """A status string indicating that a package or collection is
- installed and up-to-date."""
- NOT_INSTALLED = 'not installed'
- """A status string indicating that a package or collection is
- not installed."""
- STALE = 'out of date'
- """A status string indicating that a package or collection is
- corrupt or out-of-date."""
- PARTIAL = 'partial'
- """A status string indicating that a collection is partially
- installed (i.e., only some of its packages are installed.)"""
- # /////////////////////////////////////////////////////////////////
- # Constructor
- # /////////////////////////////////////////////////////////////////
- def __init__(self, server_index_url=None, download_dir=None):
- self._url = server_index_url or self.DEFAULT_URL
- """The URL for the data server's index file."""
- self._collections = {}
- """Dictionary from collection identifier to ``Collection``"""
- self._packages = {}
- """Dictionary from package identifier to ``Package``"""
- self._download_dir = download_dir
- """The default directory to which packages will be downloaded."""
- self._index = None
- """The XML index file downloaded from the data server"""
- self._index_timestamp = None
- """Time at which ``self._index`` was downloaded. If it is more
- than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded."""
- self._status_cache = {}
- """Dictionary from package/collection identifier to status
- string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or
- ``PARTIAL``). Cache is used for packages only, not
- collections."""
- self._errors = None
- """Flag for telling if all packages got successfully downloaded or not."""
- # decide where we're going to save things to.
- if self._download_dir is None:
- self._download_dir = self.default_download_dir()
- # /////////////////////////////////////////////////////////////////
- # Information
- # /////////////////////////////////////////////////////////////////
def list(
    self,
    download_dir=None,
    show_packages=True,
    show_collections=True,
    header=True,
    more_prompt=False,
    skip_installed=False,
):
    """
    Print a status listing of the entries in the data server index.

    :param download_dir: Directory whose install status is reported;
        when None, ``self._download_dir`` is used (and announced).
    :param show_packages: Include the entries from ``self.packages()``.
    :param show_collections: Include the entries from
        ``self.collections()``.
    :param header: Print a three-line banner naming the index URL.
    :param more_prompt: Pause for input once more than 20 lines have
        been printed; answering ``x`` or ``q`` stops the listing.
    :param skip_installed: Omit entries whose status is ``INSTALLED``.
    """
    shown_lines = 0  # running total of printed lines, for more_prompt
    if download_dir is None:
        download_dir = self._download_dir
        print('Using default data directory (%s)' % download_dir)
    if header:
        rule = '=' * (26 + len(self._url))
        print(rule)
        print(' Data server index for <%s>' % self._url)
        print(rule)
        shown_lines += 3

    saw_stale = saw_partial = False
    markers = {
        self.INSTALLED: '*',
        self.STALE: '-',
        self.PARTIAL: 'P',
        self.NOT_INSTALLED: ' ',
    }
    wanted = (('packages', show_packages), ('collections', show_collections))
    categories = [label for label, enabled in wanted if enabled]

    for category in categories:
        print('%s:' % category.capitalize())
        shown_lines += 1
        entries = getattr(self, category)()
        for info in sorted(entries, key=str):
            status = self.status(info, download_dir)
            if skip_installed and status == self.INSTALLED:
                continue
            if status == self.STALE:
                saw_stale = True
            elif status == self.PARTIAL:
                saw_partial = True
            marker = markers[status]
            # Wrap the name as though it followed a 27-column prefix, then
            # strip that placeholder prefix from the first line.
            padded = '-' * 27 + (info.name or info.id)
            wrapped = textwrap.fill(padded, 75, subsequent_indent=27 * ' ')
            name = wrapped[27:]
            print(' [%s] %s %s' % (marker, info.id.ljust(20, '.'), name))
            shown_lines += name.count('\n') + 1
            if more_prompt and shown_lines > 20:
                answer = input("Hit Enter to continue: ")
                if answer.lower() in ('x', 'q'):
                    return
                shown_lines = 0
        print()

    legend = '([*] marks installed packages'
    if saw_stale:
        legend += '; [-] marks out-of-date or corrupt packages'
    if saw_partial:
        legend += '; [P] marks partially installed collections'
    print(textwrap.fill(legend + ')', subsequent_indent=' ', width=76))
def packages(self):
    """Call ``self._update_index()``, then return the values view of
    ``self._packages`` (every known package entry)."""
    self._update_index()
    known = self._packages
    return known.values()
def corpora(self):
    """Call ``self._update_index()``, then return a list of the packages
    whose ``subdir`` is ``'corpora'``."""
    self._update_index()
    # Only the values are needed; iterating them avoids binding an unused
    # key to a name that shadows the builtin ``id``.
    return [pkg for pkg in self._packages.values() if pkg.subdir == 'corpora']
def models(self):
    """Call ``self._update_index()``, then return a list of the packages
    whose ``subdir`` is anything other than ``'corpora'``."""
    self._update_index()
    # Only the values are needed; iterating them avoids binding an unused
    # key to a name that shadows the builtin ``id``.
    return [pkg for pkg in self._packages.values() if pkg.subdir != 'corpora']
def collections(self):
    """Call ``self._update_index()``, then return the values view of
    ``self._collections`` (every known collection entry)."""
    self._update_index()
    known = self._collections
    return known.values()
- # /////////////////////////////////////////////////////////////////
- # Downloading
- # /////////////////////////////////////////////////////////////////
def _info_or_id(self, info_or_id):
    """Return an index entry for ``info_or_id``: a string is treated as
    an identifier and resolved via ``self.info()``; any other object is
    returned unchanged."""
    if not isinstance(info_or_id, string_types):
        return info_or_id
    return self.info(info_or_id)
- # [xx] When during downloading is it 'safe' to abort? Only unsafe
- # time is *during* an unzip -- we don't want to leave a
- # partially-unzipped corpus in place because we wouldn't notice
- # it. But if we had the exact total size of the unzipped corpus,
- # then that would be fine. Then we could abort anytime we want!
- # So this is really what we should do. That way the threaded
- # downloader in the gui can just kill the download thread anytime
- # it wants.
def incr_download(self, info_or_id, download_dir=None, force=False):
    """
    Generator that downloads a package, a collection, or a list/tuple
    of either, yielding ``DownloaderMessage`` objects as it goes.

    :param info_or_id: An entry object, an identifier string, or a list
        or tuple of these.
    :param download_dir: Target directory; when None,
        ``self._download_dir`` is used and a
        ``SelectDownloadDirMessage`` is yielded first.
    :param force: Passed through to the per-package download helper.
    """
    # Fall back to the default directory, and announce the choice.
    if download_dir is None:
        download_dir = self._download_dir
        yield SelectDownloadDirMessage(download_dir)

    # A sequence of ids/entries is handled by the list helper.
    if isinstance(info_or_id, (list, tuple)):
        for message in self._download_list(info_or_id, download_dir, force):
            yield message
        return

    # Resolve the requested collection or package.
    try:
        entry = self._info_or_id(info_or_id)
    except (IOError, ValueError) as e:
        yield ErrorMessage(None, 'Error loading %s: %s' % (info_or_id, e))
        return

    # Packages are delegated to a helper generator.
    if not isinstance(entry, Collection):
        for message in self._download_package(entry, download_dir, force):
            yield message
        return

    # Collections: recurse over the children, bracketed by start/finish.
    yield StartCollectionMessage(entry)
    for message in self.incr_download(entry.children, download_dir, force):
        yield message
    yield FinishCollectionMessage(entry)
def _num_packages(self, item):
    """Return how many packages ``item`` stands for: 1 for a
    ``Package``, otherwise the length of ``item.packages``."""
    return 1 if isinstance(item, Package) else len(item.packages)
def _download_list(self, items, download_dir, force):
    """
    Generator that downloads each entry of ``items`` in turn, re-scaling
    the ``ProgressMessage`` values of each one so that overall progress
    runs from 0 to 100 across the whole list.

    :param items: A list or tuple of entry objects and/or identifier
        strings.  It is not modified.
    :param download_dir: Directory passed on to ``incr_download``.
    :param force: Passed on to ``incr_download``.

    If an identifier cannot be resolved (``IOError`` or ``ValueError``),
    a single ``ErrorMessage`` is yielded and nothing is downloaded.
    """
    # Resolve into a fresh list: assigning back into ``items`` would fail
    # for tuples and would silently rewrite the caller's list.
    resolved = []
    for item in items:
        try:
            resolved.append(self._info_or_id(item))
        except (IOError, ValueError) as e:
            yield ErrorMessage(item, e)
            return

    # Download each item, re-scaling their progress.
    num_packages = sum(self._num_packages(item) for item in resolved)
    progress = 0
    for item in resolved:
        # Fraction of the overall work this item represents.  Guard the
        # case where only empty collections were requested.
        if num_packages:
            delta = float(self._num_packages(item)) / num_packages
        else:
            delta = 0.0
        for msg in self.incr_download(item, download_dir, force):
            if isinstance(msg, ProgressMessage):
                yield ProgressMessage(progress + msg.progress * delta)
            else:
                yield msg
        progress += 100 * delta
def _download_package(self, info, download_dir, force):
    """
    Generator that downloads (and, where applicable, unzips) a single
    package, yielding ``DownloaderMessage`` objects to report progress.

    :param info: The package entry to install.
    :param download_dir: Directory under which the package is stored
        (in ``info.subdir``).
    :param force: When false, a package whose status is ``INSTALLED``
        is reported as up to date and left untouched.

    An ``IOError`` during the download is reported as an
    ``ErrorMessage`` and ends the generator.
    """
    yield StartPackageMessage(info)
    yield ProgressMessage(0)

    # Do we already have the current version?
    status = self.status(info, download_dir)
    if not force and status == self.INSTALLED:
        yield UpToDateMessage(info)
        yield ProgressMessage(100)
        yield FinishPackageMessage(info)
        return

    # Remove the package from our status cache
    self._status_cache.pop(info.id, None)

    # Check for (and remove) any old/stale version.
    filepath = os.path.join(download_dir, info.filename)
    if os.path.exists(filepath):
        if status == self.STALE:
            yield StaleMessage(info)
        os.remove(filepath)

    # Ensure the download_dir and the package's subdir exist.  makedirs
    # (rather than mkdir) also creates any missing parent directories.
    for directory in (download_dir, os.path.join(download_dir, info.subdir)):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Download the file.  This will raise an IOError if the url
    # is not found.
    yield StartDownloadMessage(info)
    yield ProgressMessage(5)
    try:
        infile = urlopen(info.url)
        try:
            with open(filepath, 'wb') as outfile:
                num_blocks = max(1, info.size / (1024 * 16))
                for block in itertools.count():
                    s = infile.read(1024 * 16)  # 16k blocks.
                    if not s:
                        break
                    outfile.write(s)
                    if block % 2 == 0:  # how often?
                        yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks)))
        finally:
            # Release the connection even if reading/writing fails or the
            # consumer abandons this generator part-way through.
            infile.close()
    except IOError as e:
        yield ErrorMessage(
            info,
            'Error downloading %r from <%s>:' '\n %s' % (info.id, info.url, e),
        )
        return
    yield FinishDownloadMessage(info)
    yield ProgressMessage(80)

    # If it's a zipfile, uncompress it.
    if info.filename.endswith('.zip'):
        zipdir = os.path.join(download_dir, info.subdir)
        # Unzip if we're unzipping by default; *or* if it's already
        # been unzipped (presumably a previous version).
        if info.unzip or os.path.exists(os.path.join(zipdir, info.id)):
            yield StartUnzipMessage(info)
            for msg in _unzip_iter(filepath, zipdir, verbose=False):
                # Somewhat of a hack, but we need a proper package reference
                msg.package = info
                yield msg
            yield FinishUnzipMessage(info)

    yield FinishPackageMessage(info)
def download(
    self,
    info_or_id=None,
    download_dir=None,
    quiet=False,
    force=False,
    prefix='[nltk_data] ',
    halt_on_error=True,
    raise_on_error=False,
    print_error_to=sys.stderr,
):
    """
    Download a package or collection, printing status as it goes.

    :param info_or_id: Item to download.  If None, the interactive
        downloader is started instead and True is returned.
    :param download_dir: Target directory, passed on to
        ``incr_download``.
    :param quiet: If true, suppress status output and the retry prompt
        (error messages are still shown).
    :param force: Passed on to ``incr_download``.
    :param prefix: Text prepended to every output line.
    :param halt_on_error: If true, return False on the first error.
    :param raise_on_error: If true, raise ``ValueError`` on the first
        error.
    :param print_error_to: File-like object that all output is
        written to.
    :return: True on completion; False if halted because of an error.
    """
    print_to = functools.partial(print, file=print_error_to)

    # If no info or id is given, then use the interactive shell.
    if info_or_id is None:
        # [xx] hmm -- changing self._download_dir here seems like
        # the wrong thing to do.  Maybe the _interactive_download
        # function should make a new copy of self to use?
        if download_dir is not None:
            self._download_dir = download_dir
        self._interactive_download()
        return True

    # Text added to ``prefix`` while inside a collection.  It is removed
    # again by its own length, so nesting always unwinds exactly.
    collection_indent = ' | '

    # Helper for displaying output; reads the *current* ``prefix``.
    def show(s, prefix2=''):
        print_to(
            textwrap.fill(
                s,
                initial_indent=prefix + prefix2,
                subsequent_indent=prefix + prefix2 + ' ' * 4,
            )
        )

    for msg in self.incr_download(info_or_id, download_dir, force):
        # Error messages
        if isinstance(msg, ErrorMessage):
            show(msg.message)
            if raise_on_error:
                raise ValueError(msg.message)
            if halt_on_error:
                return False
            self._errors = True
            if not quiet:
                print_to("Error installing package. Retry? [n/y/e]")
                choice = input().strip()
                if choice in ['y', 'Y']:
                    # Forward print_error_to as well, so the retry
                    # reports to the same stream as this call.
                    if not self.download(
                        msg.package.id,
                        download_dir,
                        quiet,
                        force,
                        prefix,
                        halt_on_error,
                        raise_on_error,
                        print_error_to,
                    ):
                        return False
                elif choice in ['e', 'E']:
                    return False

        # All other messages
        if not quiet:
            # Collection downloading messages:
            if isinstance(msg, StartCollectionMessage):
                show('Downloading collection %r' % msg.collection.id)
                prefix += collection_indent
                print_to(prefix)
            elif isinstance(msg, FinishCollectionMessage):
                print_to(prefix)
                prefix = prefix[: -len(collection_indent)]
                if self._errors:
                    show(
                        'Downloaded collection %r with errors'
                        % msg.collection.id
                    )
                else:
                    show('Done downloading collection %s' % msg.collection.id)

            # Package downloading messages:
            elif isinstance(msg, StartPackageMessage):
                show(
                    'Downloading package %s to %s...'
                    % (msg.package.id, download_dir)
                )
            elif isinstance(msg, UpToDateMessage):
                show('Package %s is already up-to-date!' % msg.package.id, ' ')
            elif isinstance(msg, StartUnzipMessage):
                show('Unzipping %s.' % msg.package.filename, ' ')

            # Data directory message:
            elif isinstance(msg, SelectDownloadDirMessage):
                download_dir = msg.download_dir
    return True
def is_stale(self, info_or_id, download_dir=None):
    """Return True if ``status()`` reports the item as ``STALE``."""
    current = self.status(info_or_id, download_dir)
    return current == self.STALE
def is_installed(self, info_or_id, download_dir=None):
    """Return True if ``status()`` reports the item as ``INSTALLED``."""
    current = self.status(info_or_id, download_dir)
    return current == self.INSTALLED
def clear_status_cache(self, id=None):
    """
    Forget cached status values.

    With no argument the whole cache is emptied; otherwise only the
    entry for ``id`` is dropped (a missing entry is not an error).
    """
    if id is not None:
        self._status_cache.pop(id, None)
        return
    self._status_cache.clear()
def status(self, info_or_id, download_dir=None):
    """
    Return a constant describing the status of the given package
    or collection.  Status can be one of ``INSTALLED``,
    ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``.

    :param info_or_id: Package/collection record or identifier;
        resolved through ``self._info_or_id``.
    :param download_dir: Directory to inspect; defaults to
        ``self._download_dir``.  Results are cached only when the
        default directory is inspected.
    """
    if download_dir is None:
        download_dir = self._download_dir
    info = self._info_or_id(info_or_id)

    # Handle collections:
    if isinstance(info, Collection):
        # Check member packages in the same directory the caller asked
        # about, not in the default one.
        pkg_status = [self.status(pkg.id, download_dir) for pkg in info.packages]
        if self.STALE in pkg_status:
            return self.STALE
        elif self.PARTIAL in pkg_status:
            return self.PARTIAL
        elif self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status:
            return self.PARTIAL
        elif self.NOT_INSTALLED in pkg_status:
            return self.NOT_INSTALLED
        else:
            return self.INSTALLED

    # Handle packages:
    filepath = os.path.join(download_dir, info.filename)
    if download_dir != self._download_dir:
        # The cache is keyed by package id only, so it cannot be used
        # for a non-default directory.
        return self._pkg_status(info, filepath)
    if info.id not in self._status_cache:
        self._status_cache[info.id] = self._pkg_status(info, filepath)
    return self._status_cache[info.id]
- def _pkg_status(self, info, filepath):
- if not os.path.exists(filepath):
- return self.NOT_INSTALLED
- # Check if the file has the correct size.
- try:
- filestat = os.stat(filepath)
- except OSError:
- return self.NOT_INSTALLED
- if filestat.st_size != int(info.size):
- return self.STALE
- # Check if the file's checksum matches
- if md5_hexdigest(filepath) != info.checksum:
- return self.STALE
- # If it's a zipfile, and it's been at least partially
- # unzipped, then check if it's been fully unzipped.
- if filepath.endswith('.zip'):
- unzipdir = filepath[:-4]
- if not os.path.exists(unzipdir):
- return self.INSTALLED # but not unzipped -- ok!
- if not os.path.isdir(unzipdir):
- return self.STALE
- unzipped_size = sum(
- os.stat(os.path.join(d, f)).st_size
- for d, _, files in os.walk(unzipdir)
- for f in files
- )
- if unzipped_size != info.unzipped_size:
- return self.STALE
- # Otherwise, everything looks good.
- return self.INSTALLED
def update(self, quiet=False, prefix='[nltk_data] '):
    """
    Re-download every package whose status is ``STALE``.

    The status cache is cleared first; ``quiet`` and ``prefix`` are
    passed through to ``download()``.
    """
    self.clear_status_cache()
    for candidate in self.packages():
        if self.status(candidate) != self.STALE:
            continue
        self.download(candidate, quiet=quiet, prefix=prefix)
- # /////////////////////////////////////////////////////////////////
- # Index
- # /////////////////////////////////////////////////////////////////
- def _update_index(self, url=None):
- """A helper function that ensures that self._index is
- up-to-date. If the index is older than self.INDEX_TIMEOUT,
- then download it again."""
- # Check if the index is aleady up-to-date. If so, do nothing.
- if not (
- self._index is None
- or url is not None
- or time.time() - self._index_timestamp > self.INDEX_TIMEOUT
- ):
- return
- # If a URL was specified, then update our URL.
- self._url = url or self._url
- # Download the index file.
- self._index = nltk.internals.ElementWrapper(
- ElementTree.parse(urlopen(self._url)).getroot()
- )
- self._index_timestamp = time.time()
- # Build a dictionary of packages.
- packages = [Package.fromxml(p) for p in self._index.findall('packages/package')]
- self._packages = dict((p.id, p) for p in packages)
- # Build a dictionary of collections.
- collections = [
- Collection.fromxml(c) for c in self._index.findall('collections/collection')
- ]
- self._collections = dict((c.id, c) for c in collections)
- # Replace identifiers with actual children in collection.children.
- for collection in self._collections.values():
- for i, child_id in enumerate(collection.children):
- if child_id in self._packages:
- collection.children[i] = self._packages[child_id]
- elif child_id in self._collections:
- collection.children[i] = self._collections[child_id]
- else:
- print(
- 'removing collection member with no package: {}'.format(
- child_id
- )
- )
- del collection.children[i]
- # Fill in collection.packages for each collection.
- for collection in self._collections.values():
- packages = {}
- queue = [collection]
- for child in queue:
- if isinstance(child, Collection):
- queue.extend(child.children)
- elif isinstance(child, Package):
- packages[child.id] = child
- else:
- pass
- collection.packages = packages.values()
- # Flush the status cache
- self._status_cache.clear()
def index(self):
    """
    Return the XML index of packages available from the data
    server, refreshing it first via ``_update_index()`` (which
    downloads it when necessary).
    """
    self._update_index()
    current_index = self._index
    return current_index
def info(self, id):
    """Return the ``Package`` or ``Collection`` record for the
    given identifier; packages take precedence over collections.

    :raise ValueError: if ``id`` is in neither table.
    """
    self._update_index()
    for table in (self._packages, self._collections):
        if id in table:
            return table[id]
    raise ValueError('Package %r not found in index' % id)
def xmlinfo(self, id):
    """Return the XML info record for the given item, searching
    packages first and then collections.

    :raise ValueError: if no record has a matching ``id`` attribute.
    """
    self._update_index()
    for xpath in ('packages/package', 'collections/collection'):
        for record in self._index.findall(xpath):
            if record.get('id') == id:
                return record
    raise ValueError('Package %r not found in index' % id)
- # /////////////////////////////////////////////////////////////////
- # URL & Data Directory
- # /////////////////////////////////////////////////////////////////
- def _get_url(self):
- """The URL for the data server's index file."""
- return self._url
- def _set_url(self, url):
- """
- Set a new URL for the data server. If we're unable to contact
- the given url, then the original url is kept.
- """
- original_url = self._url
- try:
- self._update_index(url)
- except:
- self._url = original_url
- raise
- url = property(_get_url, _set_url)
def default_download_dir(self):
    """
    Return the directory to which packages will be downloaded by
    default.

    The result is chosen as follows:

    - ``None`` if ``APPENGINE_RUNTIME`` is set in the environment
      (no writable filesystem is assumed there).
    - Otherwise the first entry of ``nltk.data.path`` that exists and
      is writable.
    - Otherwise ``nltk_data`` under ``%APPDATA%`` on Windows (when that
      variable is set), or under the user's home directory.

    :raise ValueError: if the home directory cannot be expanded.
    """
    # Check if we are on GAE where we cannot write into filesystem.
    if 'APPENGINE_RUNTIME' in os.environ:
        return None

    # Prefer an existing, writable location from the data search path.
    for candidate in nltk.data.path:
        if os.path.exists(candidate) and nltk.internals.is_writable(candidate):
            return candidate

    # On Windows, use %APPDATA%
    if sys.platform == 'win32' and 'APPDATA' in os.environ:
        return os.path.join(os.environ['APPDATA'], 'nltk_data')

    # Otherwise, install in the user's home directory.
    home = os.path.expanduser('~/')
    if home == '~/':
        raise ValueError("Could not find a default download directory")
    return os.path.join(home, 'nltk_data')
- def _get_download_dir(self):
- """
- The default directory to which packages will be downloaded.
- This defaults to the value returned by ``default_download_dir()``.
- To override this default on a case-by-case basis, use the
- ``download_dir`` argument when calling ``download()``.
- """
- return self._download_dir
- def _set_download_dir(self, download_dir):
- self._download_dir = download_dir
- # Clear the status cache.
- self._status_cache.clear()
- download_dir = property(_get_download_dir, _set_download_dir)
- # /////////////////////////////////////////////////////////////////
- # Interactive Shell
- # /////////////////////////////////////////////////////////////////
def _interactive_download(self):
    """
    Start an interactive downloader: the GUI when Tkinter is
    available, falling back to the text shell if the GUI raises
    ``TclError`` or Tkinter is unavailable.
    """
    if not TKINTER:
        DownloaderShell(self).run()
        return
    try:
        DownloaderGUI(self).mainloop()
    except TclError:
        DownloaderShell(self).run()
class DownloaderShell(object):
    """Text-mode interactive front end driving a data server object."""

    def __init__(self, dataserver):
        self._ds = dataserver

    def _simple_interactive_menu(self, *options):
        """Print ``options`` on one evenly spaced line between two rules."""
        rule = '-' * 75
        print(rule)
        gap = (68 - sum(len(o) for o in options)) // (len(options) - 1)
        print(' ' + (gap * ' ').join(options))
        print(rule)

    def _download_all(self, ids):
        """Download each id in turn, printing (not raising) failures."""
        for item_id in ids:
            try:
                self._ds.download(item_id, prefix=' ')
            except (IOError, ValueError) as err:
                print(err)

    def run(self):
        """Main loop: show the menu, read a command, dispatch it."""
        print('NLTK Downloader')
        no_arg_commands = {
            'h': self._simple_interactive_help,
            'c': self._simple_interactive_config,
            'u': self._simple_interactive_update,
        }
        while True:
            self._simple_interactive_menu(
                'd) Download',
                'l) List',
                ' u) Update',
                'c) Config',
                'h) Help',
                'q) Quit',
            )
            user_input = input('Downloader> ').strip()
            if not user_input:
                print()
                continue
            words = user_input.split()
            command = words[0].lower()
            args = words[1:]
            try:
                if command in ('q', 'x'):
                    return
                if command == 'l':
                    print()
                    self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
                elif command == 'd':
                    self._simple_interactive_download(args)
                elif command in no_arg_commands:
                    no_arg_commands[command]()
                else:
                    print('Command %r unrecognized' % user_input)
            except HTTPError as err:
                print('Error reading from server: %s' % err)
            except URLError as err:
                print('Error connecting to server: %s' % err.reason)
            # try checking if user_input is a package name, &
            # downloading it?
            print()

    def _simple_interactive_download(self, args):
        """Download ``args`` if given; otherwise prompt for identifiers."""
        if args:
            self._download_all(args)
            return
        while True:
            print()
            print('Download which package (l=list; x=cancel)?')
            user_input = input(' Identifier> ')
            choice = user_input.lower()
            if choice == 'l':
                self._ds.list(
                    self._ds.download_dir,
                    header=False,
                    more_prompt=True,
                    skip_installed=True,
                )
                continue
            if choice in ('x', 'q', ''):
                return
            # Anything else is a whitespace-separated list of identifiers.
            self._download_all(user_input.split())
            break

    def _simple_interactive_update(self):
        """List stale packages and re-download them after confirmation."""
        while True:
            stale_packages = [
                (info.id, info.name)
                for info in sorted(self._ds.packages(), key=str)
                if self._ds.status(info) == self._ds.STALE
            ]
            print()
            if not stale_packages:
                print('Nothing to update.')
                return

            print('Will update following packages (o=ok; x=cancel)')
            for pid, pname in stale_packages:
                # Pad with 27 dashes so wrapping accounts for the
                # identifier column, then strip the padding again.
                name = textwrap.fill(
                    '-' * 27 + (pname), 75, subsequent_indent=27 * ' '
                )[27:]
                print(' [ ] %s %s' % (pid.ljust(20, '.'), name))
            print()

            choice = input(' Identifier> ').lower()
            if choice == 'o':
                self._download_all(pid for pid, _ in stale_packages)
                break
            if choice in ('x', 'q', ''):
                return
            # Any other answer: recompute the list and ask again.

    def _simple_interactive_help(self):
        """Print a short description of every top-level command."""
        print()
        print('Commands:')
        print(
            ' d) Download a package or collection u) Update out of date packages'
        )
        print(' l) List packages & collections h) Help')
        print(' c) View & Modify Configuration q) Quit')

    def _show_config(self):
        """Print the server URL, item counts and local data directory."""
        ds = self._ds
        print()
        print('Data Server:')
        print(' - URL: <%s>' % ds.url)
        print(' - %d Package Collections Available' % len(ds.collections()))
        print(' - %d Individual Packages Available' % len(ds.packages()))
        print()
        print('Local Machine:')
        print(' - Data directory: %s' % ds.download_dir)

    def _simple_interactive_config(self):
        """Configuration sub-menu: show config, set URL, set data dir."""
        cancel_words = ('', 'x', 'q', 'X', 'Q')
        self._show_config()
        while True:
            print()
            self._simple_interactive_menu(
                's) Show Config', 'u) Set Server URL', 'd) Set Data Dir', 'm) Main Menu'
            )
            user_input = input('Config> ').strip().lower()
            if user_input == 'm':
                break
            if user_input == 's':
                self._show_config()
            elif user_input == 'd':
                new_dl_dir = input(' New Directory> ').strip()
                if new_dl_dir in cancel_words:
                    print(' Cancelled!')
                elif os.path.isdir(new_dl_dir):
                    self._ds.download_dir = new_dl_dir
                else:
                    print('Directory %r not found! Create it first.' % new_dl_dir)
            elif user_input == 'u':
                new_url = input(' New URL> ').strip()
                if new_url in cancel_words:
                    print(' Cancelled!')
                    continue
                if not new_url.startswith(('http://', 'https://')):
                    new_url = 'http://' + new_url
                try:
                    self._ds.url = new_url
                except Exception as err:
                    print('Error reading <%r>:\n %s' % (new_url, err))
class DownloaderGUI(object):
    """
    Graphical interface for downloading packages from the NLTK data
    server.
    """

    # /////////////////////////////////////////////////////////////////
    # Column Configuration
    # /////////////////////////////////////////////////////////////////

    # Column 0 (the empty name) holds the per-row "marked" flag.
    COLUMNS = [
        '',
        'Identifier',
        'Name',
        'Size',
        'Status',
        'Unzipped Size',
        'Copyright',
        'Contact',
        'License',
        'Author',
        'Subdir',
        'Checksum',
    ]
    """A list of the names of columns. This controls the order in
    which the columns will appear. If this is edited, then
    ``_package_to_columns()`` may need to be edited to match."""

    COLUMN_WEIGHTS = {'': 0, 'Name': 5, 'Size': 0, 'Status': 0}
    """A dictionary specifying how columns should be resized when the
    table is resized. Columns with weight 0 will not be resized at
    all; and columns with high weight will be resized more.
    Default weight (for columns not explicitly listed) is 1."""

    COLUMN_WIDTHS = {
        '': 1,
        'Identifier': 20,
        'Name': 45,
        'Size': 10,
        'Unzipped Size': 10,
        'Status': 12,
    }
    """A dictionary specifying how wide each column should be, in
    characters. The default width (for columns not explicitly
    listed) is specified by ``DEFAULT_COLUMN_WIDTH``."""

    DEFAULT_COLUMN_WIDTH = 30
    """The default width for columns that are not explicitly listed
    in ``COLUMN_WIDTHS``."""

    INITIAL_COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status']
    """The set of columns that should be displayed by default."""

    # Perform a few import-time sanity checks to make sure that the
    # column configuration variables are defined consistently:
    # every key/entry above must name a column listed in COLUMNS.
    for c in COLUMN_WEIGHTS:
        assert c in COLUMNS
    for c in COLUMN_WIDTHS:
        assert c in COLUMNS
    for c in INITIAL_COLUMNS:
        assert c in COLUMNS

    # /////////////////////////////////////////////////////////////////
    # Color Configuration
    # /////////////////////////////////////////////////////////////////

    # (foreground, background) used for the window backdrop and the
    # progress label.
    _BACKDROP_COLOR = ('#000', '#ccc')

    # Per-status row colors: (background, selected background).
    _ROW_COLOR = {
        Downloader.INSTALLED: ('#afa', '#080'),
        Downloader.PARTIAL: ('#ffa', '#880'),
        Downloader.STALE: ('#faa', '#800'),
        Downloader.NOT_INSTALLED: ('#fff', '#888'),
    }

    # (foreground, background) of the "marked" column cells.
    _MARK_COLOR = ('#000', '#ccc')

    # (foreground, background) of the active / inactive tab labels.
    # _FRONT_TAB_COLOR = ('#ccf', '#008')
    # _BACK_TAB_COLOR = ('#88a', '#448')
    _FRONT_TAB_COLOR = ('#fff', '#45c')
    _BACK_TAB_COLOR = ('#aaa', '#67a')

    # Element [1] is the progress bar canvas background; element [0]
    # is presumably the bar fill color -- its use is outside this view.
    _PROGRESS_COLOR = ('#f00', '#aaa')

    # Tk font specification used for the tab labels.
    _TAB_FONT = 'helvetica -16 bold'
- # /////////////////////////////////////////////////////////////////
- # Constructor
- # /////////////////////////////////////////////////////////////////
def __init__(self, dataserver, use_threads=True):
    """
    Build the downloader window for ``dataserver``.

    :param dataserver: Object providing the package index and download
        operations; stored as ``self._ds``.
    :param use_threads: If true, ``_download`` delegates to the
        threaded downloader.
    """
    self._ds = dataserver
    self._use_threads = use_threads

    # For the threaded downloader:
    self._download_lock = threading.Lock()
    self._download_msg_queue = []
    self._download_abort_queue = []
    self._downloading = False

    # For tkinter after callbacks:
    self._afterid = {}

    # A message log.
    self._log_messages = []
    self._log_indent = 0
    self._log('NLTK Downloader Started!')

    # Create the main window.
    top = self.top = Tk()
    top.geometry('+50+50')
    top.title('NLTK Downloader')
    top.configure(background=self._BACKDROP_COLOR[1])

    # Set up some bindings now, in case anything goes wrong.
    top.bind('<Control-q>', self.destroy)
    top.bind('<Control-x>', self.destroy)
    self._destroyed = False

    # Maps column name -> Tk variable; filled in by _init_menu().
    self._column_vars = {}

    # Initialize the GUI.
    self._init_widgets()
    self._init_menu()
    # A failure to reach the server is reported in a dialog but does
    # not prevent the window from opening.
    try:
        self._fill_table()
    except HTTPError as e:
        showerror('Error reading from server', e)
    except URLError as e:
        showerror('Error connecting to server', e.reason)

    self._show_info()
    self._select_columns()
    self._table.select(0)

    # Make sure we get notified when we're destroyed, so we can
    # cancel any download in progress.
    self._table.bind('<Destroy>', self._destroy)
- def _log(self, msg):
- self._log_messages.append(
- '%s %s%s' % (time.ctime(), ' | ' * self._log_indent, msg)
- )
- # /////////////////////////////////////////////////////////////////
- # Internals
- # /////////////////////////////////////////////////////////////////
def _init_widgets(self):
    """
    Create and lay out all widgets of the main window: the tab row,
    the package table, the Download/Refresh buttons, the URL and
    download-directory entry boxes, and the progress bar.
    """
    # Create the top-level frame structures
    f1 = Frame(self.top, relief='raised', border=2, padx=8, pady=0)
    # Spelled out as ``side`` (this was previously misspelled ``sid``).
    f1.pack(side='top', expand=True, fill='both')
    f1.grid_rowconfigure(2, weight=1)
    f1.grid_columnconfigure(0, weight=1)
    Frame(f1, height=8).grid(column=0, row=0)  # spacer
    tabframe = Frame(f1)
    tabframe.grid(column=0, row=1, sticky='news')
    tableframe = Frame(f1)
    tableframe.grid(column=0, row=2, sticky='news')
    buttonframe = Frame(f1)
    buttonframe.grid(column=0, row=3, sticky='news')
    Frame(f1, height=8).grid(column=0, row=4)  # spacer
    infoframe = Frame(f1)
    infoframe.grid(column=0, row=5, sticky='news')
    Frame(f1, height=8).grid(column=0, row=6)  # spacer
    progressframe = Frame(
        self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]
    )
    progressframe.pack(side='bottom', fill='x')
    self.top['border'] = 0
    self.top['highlightthickness'] = 0

    # Create the tabs
    self._tab_names = ['Collections', 'Corpora', 'Models', 'All Packages']
    self._tabs = {}
    for i, tab in enumerate(self._tab_names):
        label = Label(tabframe, text=tab, font=self._TAB_FONT)
        # Even-indexed tabs get horizontal padding, odd-indexed none.
        label.pack(side='left', padx=((i + 1) % 2) * 10)
        label.bind('<Button-1>', self._select_tab)
        self._tabs[tab.lower()] = label

    # Create the table.
    column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS]
    self._table = Table(
        tableframe,
        self.COLUMNS,
        column_weights=column_weights,
        highlightthickness=0,
        listbox_height=16,
        reprfunc=self._table_reprfunc,
    )
    self._table.columnconfig(0, foreground=self._MARK_COLOR[0])  # marked
    for i, column in enumerate(self.COLUMNS):
        width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
        self._table.columnconfig(i, width=width)
    self._table.pack(expand=True, fill='both')
    self._table.focus()
    self._table.bind_to_listboxes('<Double-Button-1>', self._download)
    self._table.bind('<space>', self._table_mark)
    self._table.bind('<Return>', self._download)
    self._table.bind('<Left>', self._prev_tab)
    self._table.bind('<Right>', self._next_tab)
    self._table.bind('<Control-a>', self._mark_all)

    # Create entry boxes for URL & download_dir
    infoframe.grid_columnconfigure(1, weight=1)
    info = [
        ('url', 'Server Index:', self._set_url),
        ('download_dir', 'Download Directory:', self._set_download_dir),
    ]
    # Maps key -> (entry widget, callback invoked with the saved text).
    self._info = {}
    for (i, (key, label, callback)) in enumerate(info):
        Label(infoframe, text=label).grid(column=0, row=i, sticky='e')
        entry = Entry(
            infoframe, font='courier', relief='groove', disabledforeground='black'
        )
        self._info[key] = (entry, callback)
        entry.bind('<Return>', self._info_save)
        # ``key=key`` binds the current key; a plain closure would see
        # only the last value of the loop variable.
        entry.bind('<Button-1>', lambda e, key=key: self._info_edit(key))
        entry.grid(column=1, row=i, sticky='ew')

    # If the user edits url or download_dir, and then clicks outside
    # the entry box, then save their results.
    self.top.bind('<Button-1>', self._info_save)

    # Create Download & Refresh buttons.
    self._download_button = Button(
        buttonframe, text='Download', command=self._download, width=8
    )
    self._download_button.pack(side='left')
    self._refresh_button = Button(
        buttonframe, text='Refresh', command=self._refresh, width=8
    )
    self._refresh_button.pack(side='right')

    # Create Progress bar
    self._progresslabel = Label(
        progressframe,
        text='',
        foreground=self._BACKDROP_COLOR[0],
        background=self._BACKDROP_COLOR[1],
    )
    self._progressbar = Canvas(
        progressframe,
        width=200,
        height=16,
        background=self._PROGRESS_COLOR[1],
        relief='sunken',
        border=1,
    )
    self._init_progressbar()
    self._progressbar.pack(side='right')
    self._progresslabel.pack(side='left')
def _init_menu(self):
    """
    Build the menu bar (File, View, Sort, Help) and attach it to the
    top-level window.  Also creates one Tk variable per optional
    column in ``self._column_vars`` and binds F1 to the help command.
    """
    menubar = Menu(self.top)

    filemenu = Menu(menubar, tearoff=0)
    filemenu.add_command(
        label='Download', underline=0, command=self._download, accelerator='Return'
    )
    filemenu.add_separator()
    filemenu.add_command(
        label='Change Server Index',
        underline=7,
        command=lambda: self._info_edit('url'),
    )
    filemenu.add_command(
        label='Change Download Directory',
        underline=0,
        command=lambda: self._info_edit('download_dir'),
    )
    filemenu.add_separator()
    filemenu.add_command(label='Show Log', underline=5, command=self._show_log)
    filemenu.add_separator()
    filemenu.add_command(
        label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x'
    )
    menubar.add_cascade(label='File', underline=0, menu=filemenu)

    # Create a menu to control which columns of the table are
    # shown. n.b.: we never hide the first two columns (mark and
    # identifier).
    viewmenu = Menu(menubar, tearoff=0)
    for column in self._table.column_names[2:]:
        var = IntVar(self.top)
        assert column not in self._column_vars
        self._column_vars[column] = var
        # Columns shown by default start out checked.
        if column in self.INITIAL_COLUMNS:
            var.set(1)
        viewmenu.add_checkbutton(
            label=column, underline=0, variable=var, command=self._select_columns
        )
    menubar.add_cascade(label='View', underline=0, menu=viewmenu)

    # Create a sort menu
    # [xx] this should be selectbuttons; and it should include
    # reversed sorts as options.
    sortmenu = Menu(menubar, tearoff=0)
    for column in self._table.column_names[1:]:
        # ``c=column`` binds the current column name for the callback.
        sortmenu.add_command(
            label='Sort by %s' % column,
            command=(lambda c=column: self._table.sort_by(c, 'ascending')),
        )
    sortmenu.add_separator()
    # sortmenu.add_command(label='Descending Sort:')
    for column in self._table.column_names[1:]:
        sortmenu.add_command(
            label='Reverse sort by %s' % column,
            command=(lambda c=column: self._table.sort_by(c, 'descending')),
        )
    menubar.add_cascade(label='Sort', underline=0, menu=sortmenu)

    helpmenu = Menu(menubar, tearoff=0)
    helpmenu.add_command(label='About', underline=0, command=self.about)
    helpmenu.add_command(
        label='Instructions', underline=0, command=self.help, accelerator='F1'
    )
    menubar.add_cascade(label='Help', underline=0, menu=helpmenu)
    self.top.bind('<F1>', self.help)

    self.top.config(menu=menubar)
- def _select_columns(self):
- for (column, var) in self._column_vars.items():
- if var.get():
- self._table.show_column(column)
- else:
- self._table.hide_column(column)
- def _refresh(self):
- self._ds.clear_status_cache()
- try:
- self._fill_table()
- except HTTPError as e:
- showerror('Error reading from server', e)
- except URLError as e:
- showerror('Error connecting to server', e.reason)
- self._table.select(0)
- def _info_edit(self, info_key):
- self._info_save() # just in case.
- (entry, callback) = self._info[info_key]
- entry['state'] = 'normal'
- entry['relief'] = 'sunken'
- entry.focus()
- def _info_save(self, e=None):
- focus = self._table
- for entry, callback in self._info.values():
- if entry['state'] == 'disabled':
- continue
- if e is not None and e.widget is entry and e.keysym != 'Return':
- focus = entry
- else:
- entry['state'] = 'disabled'
- entry['relief'] = 'groove'
- callback(entry.get())
- focus.focus()
- def _table_reprfunc(self, row, col, val):
- if self._table.column_names[col].endswith('Size'):
- if isinstance(val, string_types):
- return ' %s' % val
- elif val < 1024 ** 2:
- return ' %.1f KB' % (val / 1024.0 ** 1)
- elif val < 1024 ** 3:
- return ' %.1f MB' % (val / 1024.0 ** 2)
- else:
- return ' %.1f GB' % (val / 1024.0 ** 3)
- if col in (0, ''):
- return str(val)
- else:
- return ' %s' % val
- def _set_url(self, url):
- if url == self._ds.url:
- return
- try:
- self._ds.url = url
- self._fill_table()
- except IOError as e:
- showerror('Error Setting Server Index', str(e))
- self._show_info()
- def _set_download_dir(self, download_dir):
- if self._ds.download_dir == download_dir:
- return
- # check if the dir exists, and if not, ask if we should create it?
- # Clear our status cache, & re-check what's installed
- self._ds.download_dir = download_dir
- try:
- self._fill_table()
- except HTTPError as e:
- showerror('Error reading from server', e)
- except URLError as e:
- showerror('Error connecting to server', e.reason)
- self._show_info()
- def _show_info(self):
- print('showing info', self._ds.url)
- for entry, cb in self._info.values():
- entry['state'] = 'normal'
- entry.delete(0, 'end')
- self._info['url'][0].insert(0, self._ds.url)
- self._info['download_dir'][0].insert(0, self._ds.download_dir)
- for entry, cb in self._info.values():
- entry['state'] = 'disabled'
- def _prev_tab(self, *e):
- for i, tab in enumerate(self._tab_names):
- if tab.lower() == self._tab and i > 0:
- self._tab = self._tab_names[i - 1].lower()
- try:
- return self._fill_table()
- except HTTPError as e:
- showerror('Error reading from server', e)
- except URLError as e:
- showerror('Error connecting to server', e.reason)
- def _next_tab(self, *e):
- for i, tab in enumerate(self._tab_names):
- if tab.lower() == self._tab and i < (len(self._tabs) - 1):
- self._tab = self._tab_names[i + 1].lower()
- try:
- return self._fill_table()
- except HTTPError as e:
- showerror('Error reading from server', e)
- except URLError as e:
- showerror('Error connecting to server', e.reason)
- def _select_tab(self, event):
- self._tab = event.widget['text'].lower()
- try:
- self._fill_table()
- except HTTPError as e:
- showerror('Error reading from server', e)
- except URLError as e:
- showerror('Error connecting to server', e.reason)
# Lower-cased name of the tab currently displayed.  This class-level
# value is the default until a tab is selected on an instance.
_tab = 'collections'
# _tab = 'corpora'
# NOTE(review): no use of _rows is visible in this part of the file --
# confirm whether it is still needed.
_rows = None
- def _fill_table(self):
- selected_row = self._table.selected_row()
- self._table.clear()
- if self._tab == 'all packages':
- items = self._ds.packages()
- elif self._tab == 'corpora':
- items = self._ds.corpora()
- elif self._tab == 'models':
- items = self._ds.models()
- elif self._tab == 'collections':
- items = self._ds.collections()
- else:
- assert 0, 'bad tab value %r' % self._tab
- rows = [self._package_to_columns(item) for item in items]
- self._table.extend(rows)
- # Highlight the active tab.
- for tab, label in self._tabs.items():
- if tab == self._tab:
- label.configure(
- foreground=self._FRONT_TAB_COLOR[0],
- background=self._FRONT_TAB_COLOR[1],
- )
- else:
- label.configure(
- foreground=self._BACK_TAB_COLOR[0],
- background=self._BACK_TAB_COLOR[1],
- )
- self._table.sort_by('Identifier', order='ascending')
- self._color_table()
- self._table.select(selected_row)
- # This is a hack, because the scrollbar isn't updating its
- # position right -- I'm not sure what the underlying cause is
- # though. (This is on OS X w/ python 2.5) The length of
- # delay that's necessary seems to depend on how fast the
- # comptuer is. :-/
- self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview())
- self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview())
- def _update_table_status(self):
- for row_num in range(len(self._table)):
- status = self._ds.status(self._table[row_num, 'Identifier'])
- self._table[row_num, 'Status'] = status
- self._color_table()
- def _download(self, *e):
- # If we're using threads, then delegate to the threaded
- # downloader instead.
- if self._use_threads:
- return self._download_threaded(*e)
- marked = [
- self._table[row, 'Identifier']
- for row in range(len(self._table))
- if self._table[row, 0] != ''
- ]
- selection = self._table.selected_row()
- if not marked and selection is not None:
- marked = [self._table[selection, 'Identifier']]
- download_iter = self._ds.incr_download(marked, self._ds.download_dir)
- self._log_indent = 0
- self._download_cb(download_iter, marked)
# Delay (in msec) between two consecutive steps of a non-threaded download.
_DL_DELAY = 10

def _download_cb(self, download_iter, ids):
    """
    Take one message from ``download_iter``, report it in the GUI, and
    schedule the next step ``_DL_DELAY`` msec later.  The chain stops
    when the iterator is exhausted or when an error message arrives.
    ``ids`` is only passed along to the next scheduled call.
    """
    try:
        message = next(download_iter)
    except StopIteration:
        # Nothing left to download: refresh statuses, then reset the bar.
        self._update_table_status()
        self._afterid['_download_cb'] = self.top.after(10, self._show_progress, 0)
        return

    def report(text):
        # Show the text in the progress label and append it to the log.
        self._progresslabel['text'] = text
        self._log(text)

    if isinstance(message, ProgressMessage):
        self._show_progress(message.progress)
    elif isinstance(message, ErrorMessage):
        report(message.message)
        if message.package is not None:
            self._select(message.package.id)
        self._show_progress(None)
        return  # halt progress.
    elif isinstance(message, StartCollectionMessage):
        report('Downloading collection %s' % message.collection.id)
        self._log_indent += 1
    elif isinstance(message, StartPackageMessage):
        report('Downloading package %s' % message.package.id)
    elif isinstance(message, UpToDateMessage):
        report('Package %s is up-to-date!' % message.package.id)
    elif isinstance(message, FinishDownloadMessage):
        report('Finished downloading %r.' % message.package.id)
    elif isinstance(message, StartUnzipMessage):
        report('Unzipping %s' % message.package.filename)
    elif isinstance(message, FinishCollectionMessage):
        self._log_indent -= 1
        report('Finished downloading collection %r.' % message.collection.id)
        self._clear_mark(message.collection.id)
    elif isinstance(message, FinishPackageMessage):
        self._clear_mark(message.package.id)

    # Schedule the next step of the download.
    self._afterid['_download_cb'] = self.top.after(
        self._DL_DELAY, self._download_cb, download_iter, ids
    )
def _select(self, id):
    """
    Select the first table row whose 'Identifier' column equals ``id``.
    Does nothing when no row matches.
    """
    table = self._table
    matching_rows = (
        row for row in range(len(table)) if table[row, 'Identifier'] == id
    )
    first_match = next(matching_rows, None)
    if first_match is not None:
        table.select(first_match)
def _color_table(self):
    """
    Color each table row according to the value in its 'Status' column
    (looked up in ``_ROW_COLOR``), and give the first ("mark") column
    the colors from ``_MARK_COLOR``.
    """
    text_color, selected_text_color = 'black', 'white'
    table = self._table
    for row in range(len(table)):
        row_bg, selected_row_bg = self._ROW_COLOR[table[row, 'Status']]
        table.rowconfig(
            row,
            foreground=text_color,
            selectforeground=selected_text_color,
            background=row_bg,
            selectbackground=selected_row_bg,
        )
        # The mark column has its own colors, independent of the status.
        mark_fg = self._MARK_COLOR[0]
        mark_bg = self._MARK_COLOR[1]
        table.itemconfigure(row, 0, foreground=mark_fg, background=mark_bg)
def _clear_mark(self, id):
    """
    Blank out the first ("mark") column of every row whose
    'Identifier' column equals ``id``.
    """
    table = self._table
    for row in range(len(table)):
        if table[row, 'Identifier'] != id:
            continue
        table[row, 0] = ''
def _mark_all(self, *e):
    """Put an 'X' in the first ("mark") column of every table row."""
    table = self._table
    row_count = len(table)
    for row in range(row_count):
        table[row, 0] = 'X'
def _table_mark(self, *e):
    """
    Toggle the mark ('X' / empty) on the selected row, then move the
    selection one row down.

    Other methods in this class treat ``selected_row()`` as returning
    ``None`` when nothing is selected, so that case is checked
    explicitly: comparing ``None >= 0`` raises ``TypeError`` on
    Python 3.
    """
    selection = self._table.selected_row()
    if selection is not None and selection >= 0:
        if self._table[selection, 0] != '':
            self._table[selection, 0] = ''
        else:
            self._table[selection, 0] = 'X'
    # NOTE(review): the original indentation is not recoverable from this
    # view; the selection is assumed to advance regardless of whether a
    # row was toggled -- confirm.
    self._table.select(delta=1)
def _show_log(self):
    """Open a text window showing all logged messages, one per line."""
    ShowText(self.top, 'NLTK Downloader Log', '\n'.join(self._log_messages))
def _package_to_columns(self, pkg):
    """
    Given a package, return a list of values describing that
    package, one for each column in ``self.COLUMNS``.
    """

    def cell(position, heading):
        # The first column is always the (initially empty) mark column.
        if position == 0:
            return ''
        if heading == 'Identifier':
            return pkg.id
        if heading == 'Status':
            return self._ds.status(pkg)
        # Any other column maps to a package attribute named after the
        # heading, e.g. 'Unzipped Size' -> ``pkg.unzipped_size``.
        attribute = heading.lower().replace(' ', '_')
        return getattr(pkg, attribute, 'n/a')

    return [cell(position, heading) for position, heading in enumerate(self.COLUMNS)]
- # /////////////////////////////////////////////////////////////////
- # External Interface
- # /////////////////////////////////////////////////////////////////
def destroy(self, *e):
    """
    Destroy the top-level window.  Only the first call has any
    effect; later calls return immediately.
    """
    if not self._destroyed:
        self.top.destroy()
        self._destroyed = True
def _destroy(self, *e):
    """
    Clean up: cancel every pending ``after`` callback, abort a running
    threaded download, and drop the column variables.
    """
    top = self.top
    if top is not None:
        for pending in self._afterid.values():
            top.after_cancel(pending)

    # Abort any download in progress.
    if self._downloading and self._use_threads:
        self._abort_download()

    # Make sure the garbage collector destroys these now;
    # otherwise, they may get destroyed when we're not in the main
    # thread, which would make Tkinter unhappy.
    self._column_vars.clear()
def mainloop(self, *args, **kwargs):
    """Run the main loop of the top-level window, forwarding all arguments."""
    window = self.top
    window.mainloop(*args, **kwargs)
- # /////////////////////////////////////////////////////////////////
- # HELP
- # /////////////////////////////////////////////////////////////////
# Text shown by ``help()``.
HELP = textwrap.dedent(
    """\
This tool can be used to download a variety of corpora and models
that can be used with NLTK. Each corpus or model is distributed
in a single zip file, known as a \"package file.\" You can
download packages individually, or you can download pre-defined
collections of packages.
When you download a package, it will be saved to the \"download
directory.\" A default download directory is chosen when you run
the downloader; but you may also select a different download
directory. On Windows, the default download directory is
\"package.\"
The NLTK downloader can be used to download a variety of corpora,
models, and other data packages.
Keyboard shortcuts::
[return]\t Download
[up]\t Select previous package
[down]\t Select next package
[left]\t Select previous tab
[right]\t Select next tab
"""
)

def help(self, *e):
    """
    Show the ``HELP`` text in its own window.

    The default font's not very legible, so the 'fixed' font is tried
    first; if that attempt raises, the window is opened again without
    a font argument.
    """
    title = 'Help: NLTK Downloader'
    text = self.HELP.strip()
    try:
        ShowText(self.top, title, text, width=75, font='fixed')
    except Exception:
        # Narrower than a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are not swallowed.
        ShowText(self.top, title, text, width=75)
def about(self, *e):
    """
    Show the "about" text in a message box; if the message-box module
    cannot be imported, show it in a plain text window instead.
    """
    about_text = 'NLTK Downloader\nWritten by Edward Loper'
    window_title = 'About: NLTK Downloader'
    try:
        from six.moves.tkinter_messagebox import Message

        box = Message(message=about_text, title=window_title)
        box.show()
    except ImportError:
        ShowText(self.top, window_title, about_text)
- # /////////////////////////////////////////////////////////////////
- # Progress Bar
- # /////////////////////////////////////////////////////////////////
# Width of each diagonal line that makes up the progress bar's gradient.
_gradient_width = 5

def _init_progressbar(self):
    """
    Create the canvas items of the progress bar: a set of diagonal
    lines tagged 'gradient' (initially hidden) and a rectangle tagged
    'redbox' that is used to display progress.
    """
    canvas = self._progressbar
    bar_width, bar_height = int(canvas['width']), int(canvas['height'])
    stripe = self._gradient_width

    for i in range(0, (bar_width * 2) // stripe):
        top_x = i * stripe + 20
        bottom_x = i * stripe - bar_height - 20
        # The red component cycles with a period of six stripes.
        shade = 80 + abs(i % 6 - 3) * 12
        canvas.create_line(
            top_x,
            -20,
            bottom_x,
            bar_height + 20,
            width=stripe,
            fill='#%02x0000' % shade,
        )
    canvas.addtag_all('gradient')
    canvas.itemconfig('gradient', state='hidden')

    # This is used to display progress
    progress_box = canvas.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])
    canvas.addtag_withtag('redbox', progress_box)
def _show_progress(self, percent):
    """
    Resize the 'redbox' item to cover ``percent`` percent of the
    progress bar.  ``percent=None`` collapses the box and hides the
    'gradient' items.
    """
    canvas = self._progressbar
    if percent is not None:
        bar_width, bar_height = int(canvas['width']), int(canvas['height'])
        right_edge = percent * bar_width // 100 + 1
        canvas.coords('redbox', 0, 0, right_edge, bar_height + 1)
        return
    canvas.coords('redbox', 0, 0, 0, 0)
    canvas.itemconfig('gradient', state='hidden')
def _progress_alive(self):
    """
    While a download is running, shift the 'gradient' items a little
    and reschedule this method in 200 msec; once no download is
    running, hide the gradient and stop rescheduling.
    """
    canvas = self._progressbar
    if not self._downloading:
        canvas.itemconfig('gradient', state='hidden')
        return

    canvas.itemconfig('gradient', state='normal')
    left, _top, _right, _bottom = canvas.bbox('gradient')
    # Move 4 pixels left each tick; once the items have drifted to
    # x <= -100, jump right by six stripe widths first.
    shift = (self._gradient_width * 6) - 4 if left <= -100 else -4
    canvas.move('gradient', shift, 0)
    self._afterid['_progress_alive'] = self.top.after(200, self._progress_alive)
- # /////////////////////////////////////////////////////////////////
- # Threaded downloader
- # /////////////////////////////////////////////////////////////////
def _download_threaded(self, *e):
    """
    Start downloading the marked rows (or the selected row, if none
    is marked) in a separate thread.  If a download is already
    running, request that it be aborted instead.
    """
    # If the user tries to start a new download while we're already
    # downloading something, then abort the current download instead.
    if self._downloading:
        self._abort_download()
        return

    # Change the 'download' button to an 'abort' button.
    self._download_button['text'] = 'Cancel'

    table = self._table
    marked = []
    for row in range(len(table)):
        if table[row, 0] != '':
            marked.append(table[row, 'Identifier'])
    selection = table.selected_row()
    if selection is not None and not marked:
        marked = [table[selection, 'Identifier']]

    # Create a new data server object for the download operation,
    # just in case the user modifies our data server during the
    # download (e.g., clicking 'refresh' or editing the index url).
    ds = Downloader(self._ds.url, self._ds.download_dir)

    # Start downloading in a separate thread.
    assert self._download_msg_queue == []
    assert self._download_abort_queue == []
    worker = self._DownloadThread(
        ds,
        marked,
        self._download_lock,
        self._download_msg_queue,
        self._download_abort_queue,
    )
    worker.start()

    # Monitor the download message queue & display its progress.
    self._log_indent = 0
    self._downloading = True
    self._monitor_message_queue()

    # Display an indication that we're still alive and well by
    # cycling the progress bar.
    self._progress_alive()
def _abort_download(self):
    """
    Ask the download thread to stop by appending 'abort' to the abort
    queue (under the download lock).  No-op when nothing is downloading.
    """
    if not self._downloading:
        return
    with self._download_lock:
        self._download_abort_queue.append('abort')
class _DownloadThread(threading.Thread):
    """
    Thread that runs ``data_server.incr_download(items)`` and appends
    every message it yields to ``message_queue`` while holding ``lock``.
    After each message it checks ``abort``; if that list is non-empty
    it appends 'aborted' and stops.  When the download runs to
    completion it appends 'finished'.
    """

    def __init__(self, data_server, items, lock, message_queue, abort):
        self.data_server = data_server
        self.items = items
        self.lock = lock
        self.message_queue = message_queue
        self.abort = abort
        threading.Thread.__init__(self)

    def run(self):
        for message in self.data_server.incr_download(self.items):
            with self.lock:
                self.message_queue.append(message)
                # Check if we've been told to kill ourselves:
                if self.abort:
                    self.message_queue.append('aborted')
                    return

        with self.lock:
            self.message_queue.append('finished')
# Delay (in msec) between two checks of the download message queue.
_MONITOR_QUEUE_DELAY = 100

def _monitor_message_queue(self):
    """
    Process the messages that the download thread has appended to
    ``_download_msg_queue`` and update the GUI accordingly, then
    reschedule itself after ``_MONITOR_QUEUE_DELAY`` msec.

    Monitoring stops (no reschedule) when a 'finished' or 'aborted'
    marker or an ``ErrorMessage`` is seen.

    The download lock is taken without blocking, so the GUI never
    waits on the worker, and it is released on every exit path.
    Previously the ``ErrorMessage`` path returned with the lock still
    held, which left the worker blocked on its next ``acquire()``.
    """

    def show(s):
        self._progresslabel['text'] = s
        self._log(s)

    def check_again_later():
        afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue)
        self._afterid['_monitor_message_queue'] = afterid

    # Try to acquire the lock; if it's busy, then just try again later.
    if not self._download_lock.acquire(False):
        check_again_later()
        return

    try:
        for msg in self._download_msg_queue:
            # Done downloading?
            if msg == 'finished' or msg == 'aborted':
                self._update_table_status()
                self._downloading = False
                self._download_button['text'] = 'Download'
                del self._download_msg_queue[:]
                del self._download_abort_queue[:]
                if msg == 'aborted':
                    show('Download aborted!')
                    self._show_progress(None)
                else:
                    afterid = self.top.after(100, self._show_progress, None)
                    self._afterid['_monitor_message_queue'] = afterid
                return

            # All other messages
            elif isinstance(msg, ProgressMessage):
                self._show_progress(msg.progress)
            elif isinstance(msg, ErrorMessage):
                show(msg.message)
                if msg.package is not None:
                    self._select(msg.package.id)
                self._show_progress(None)
                self._downloading = False
                # NOTE(review): as before, the queues are not cleared and
                # the button text is not reset on this path -- confirm
                # whether that is intended.
                return  # halt progress.
            elif isinstance(msg, StartCollectionMessage):
                show('Downloading collection %r' % msg.collection.id)
                self._log_indent += 1
            elif isinstance(msg, StartPackageMessage):
                self._ds.clear_status_cache(msg.package.id)
                show('Downloading package %r' % msg.package.id)
            elif isinstance(msg, UpToDateMessage):
                show('Package %s is up-to-date!' % msg.package.id)
            elif isinstance(msg, FinishDownloadMessage):
                show('Finished downloading %r.' % msg.package.id)
            elif isinstance(msg, StartUnzipMessage):
                show('Unzipping %s' % msg.package.filename)
            elif isinstance(msg, FinishUnzipMessage):
                show('Finished installing %s' % msg.package.id)
            elif isinstance(msg, FinishCollectionMessage):
                self._log_indent -= 1
                show('Finished downloading collection %r.' % msg.collection.id)
                self._clear_mark(msg.collection.id)
            elif isinstance(msg, FinishPackageMessage):
                self._update_table_status()
                self._clear_mark(msg.package.id)

        # Let the user know when we're aborting a download (but
        # waiting for a good point to abort it, so we don't end up
        # with a partially unzipped package or anything like that).
        if self._download_abort_queue:
            self._progresslabel['text'] = 'Aborting download...'

        # Every queued message has been handled; clear the queue.
        del self._download_msg_queue[:]
    finally:
        self._download_lock.release()

    # Check the queue again after MONITOR_QUEUE_DELAY msec.
    check_again_later()
- ######################################################################
- # Helper Functions
- ######################################################################
- # [xx] It may make sense to move these to nltk.internals.
def md5_hexdigest(file):
    """
    Calculate and return the MD5 checksum for a given file.
    ``file`` may either be a filename or an open stream.
    """
    if not isinstance(file, string_types):
        # Already a stream: hash it directly.
        return _md5_hexdigest(file)
    with open(file, 'rb') as stream:
        return _md5_hexdigest(stream)
def _md5_hexdigest(fp):
    """
    Return the hexadecimal MD5 digest of everything that can be read
    from the stream ``fp``, reading it in 16k blocks.
    """
    block_size = 1024 * 16  # 16k blocks
    digest = md5()
    chunk = fp.read(block_size)
    while chunk:
        digest.update(chunk)
        chunk = fp.read(block_size)
    return digest.hexdigest()
- # change this to periodically yield progress messages?
- # [xx] get rid of topdir parameter -- we should be checking
- # this when we build the index, anyway.
def unzip(filename, root, verbose=True):
    """
    Extract the contents of the zip file ``filename`` into the
    directory ``root``.  Raises ``Exception`` (wrapping the message)
    as soon as the extraction reports an ``ErrorMessage``.
    """
    errors = (
        message
        for message in _unzip_iter(filename, root, verbose)
        if isinstance(message, ErrorMessage)
    )
    first_error = next(errors, None)
    if first_error is not None:
        raise Exception(first_error)
def _unzip_iter(filename, root, verbose=True):
    """
    Generator that extracts the zip file ``filename`` into the
    directory ``root``.  It yields an ``ErrorMessage`` and stops when
    the archive cannot be opened, contains an unsafe member path, or a
    member cannot be written; on success it yields nothing.

    With ``verbose`` true, a progress line is written to ``sys.stdout``
    (the archive name followed by roughly ten dots).
    """
    if verbose:
        sys.stdout.write('Unzipping %s' % os.path.split(filename)[1])
        sys.stdout.flush()

    try:
        zf = zipfile.ZipFile(filename)
    except zipfile.error:
        yield ErrorMessage(filename, 'Error with downloaded zip file')
        return
    except Exception as e:
        yield ErrorMessage(filename, e)
        return

    # From here on the archive is open; make sure it gets closed again.
    try:
        namelist = zf.namelist()

        # Refuse members that would be written outside of ``root``.
        for name in namelist:
            if '..' in name.split('/'):
                yield ErrorMessage(filename, 'Unsafe path in zip file: %s' % name)
                return

        # Get lists of directories & files
        dirlist = set()
        for name in namelist:
            if name.endswith('/'):
                dirlist.add(name)
            elif '/' in name:
                # Only members inside a directory have a parent to create;
                # a top-level file must not become a directory itself.
                dirlist.add(name.rsplit('/', 1)[0] + '/')
        filelist = [name for name in namelist if not name.endswith('/')]

        # Create the target directory if it doesn't exist
        if not os.path.exists(root):
            os.mkdir(root)

        # Create the directory structure (parents sort before children).
        for dirname in sorted(dirlist):
            dirpath = os.path.join(root, *dirname[:-1].split('/'))
            if not os.path.exists(dirpath):
                os.makedirs(dirpath)

        # Extract files.
        total = len(filelist)
        for i, member in enumerate(filelist):
            filepath = os.path.join(root, *member.split('/'))
            try:
                with open(filepath, 'wb') as dstfile, zf.open(member) as srcfile:
                    shutil.copyfileobj(srcfile, dstfile)
            except Exception as e:
                yield ErrorMessage(member, e)
                return

            # Integer division, so that a dot is printed only when another
            # tenth of the files is done (true division printed one per file).
            if verbose and (i * 10 // total > (i - 1) * 10 // total):
                sys.stdout.write('.')
                sys.stdout.flush()
    finally:
        zf.close()

    if verbose:
        print()
- ######################################################################
- # Index Builder
- ######################################################################
- # This may move to a different file sometime.
def build_index(root, base_url):
    """
    Create a new data.xml index file, by combining the xml description
    files for various packages and collections.  ``root`` should be the
    path to a directory containing the package xml and zip files; and
    the collection xml files.  The ``root`` directory is expected to
    have the following subdirectories::

      root/
        packages/ .................. subdirectory for packages
          corpora/ ................. zip & xml files for corpora
          grammars/ ................ zip & xml files for grammars
          taggers/ ................. zip & xml files for taggers
          tokenizers/ .............. zip & xml files for tokenizers
          etc.
        collections/ ............... xml files for collections

    For each package, there should be two files: ``package.zip``
    (where *package* is the package name)
    which contains the package itself as a compressed zip file; and
    ``package.xml``, which is an xml description of the package.  The
    zipfile ``package.zip`` should expand to a single subdirectory
    named ``package/``.  The base filename ``package`` must match
    the identifier given in the package's xml file.

    For each collection, there should be a single file ``collection.xml``
    describing the collection, where *collection* is the name of the collection.

    All identifiers (for both packages and collections) must be unique.

    :return: the root ``nltk_data`` element of the new index.
    :raise ValueError: if two items share the same identifier.
    """
    # Find all packages.
    packages = []
    for pkg_xml, zf, subdir in _find_packages(os.path.join(root, 'packages')):
        # The open archive is only needed for its member sizes; close it
        # as soon as those are read so that handles do not pile up.
        try:
            zip_path = zf.filename
            unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())
        finally:
            zf.close()
        zipstat = os.stat(zip_path)
        url = '%s/%s/%s' % (base_url, subdir, os.path.split(zip_path)[1])

        # Fill in several fields of the package xml with calculated values.
        pkg_xml.set('unzipped_size', '%s' % unzipped_size)
        pkg_xml.set('size', '%s' % zipstat.st_size)
        pkg_xml.set('checksum', '%s' % md5_hexdigest(zip_path))
        pkg_xml.set('subdir', subdir)
        # An explicit url in the package xml takes precedence.
        if not pkg_xml.get('url'):
            pkg_xml.set('url', url)

        # Record the package.
        packages.append(pkg_xml)

    # Find all collections
    collections = list(_find_collections(os.path.join(root, 'collections')))

    # Check that all UIDs are unique
    uids = set()
    for item in packages + collections:
        if item.get('id') in uids:
            raise ValueError('Duplicate UID: %s' % item.get('id'))
        uids.add(item.get('id'))

    # Put it all together
    top_elt = ElementTree.Element('nltk_data')
    top_elt.append(ElementTree.Element('packages'))
    for package in packages:
        top_elt[0].append(package)
    top_elt.append(ElementTree.Element('collections'))
    for collection in collections:
        top_elt[1].append(collection)

    _indent_xml(top_elt)
    return top_elt
def _indent_xml(xml, prefix=''):
    """
    Helper for ``build_index()``: Given an XML ``ElementTree``, modify it
    (and its descendents) ``text`` and ``tail`` attributes to generate
    an indented tree, where each nested element is indented by 2
    spaces with respect to its parent.

    ``prefix`` is the indentation of ``xml`` itself.  Elements without
    children are left untouched.
    """
    if len(xml) > 0:
        child_prefix = prefix + '  '
        # Text before the first child: newline plus the child indentation.
        xml.text = (xml.text or '').strip() + '\n' + child_prefix
        for child in xml:
            _indent_xml(child, child_prefix)
        # Every child but the last is followed by another child ...
        for child in xml[:-1]:
            child.tail = (child.tail or '').strip() + '\n' + child_prefix
        # ... while the last one is followed by the parent's closing tag.
        xml[-1].tail = (xml[-1].tail or '').strip() + '\n' + prefix
def _check_package(pkg_xml, zipfilename, zf):
    """
    Helper for ``build_index()``: Perform some checks to make sure that
    the given package is consistent.  Raises ``ValueError`` when a
    check fails.
    """
    # The filename must match the id given in the XML file.
    basename = os.path.split(zipfilename)[1]
    uid = os.path.splitext(basename)[0]
    declared_id = pkg_xml.get('id')
    if declared_id != uid:
        raise ValueError(
            'package identifier mismatch (%s vs %s)' % (declared_id, uid)
        )

    # Zip file must expand to a subdir whose name matches uid.
    subdir_prefix = uid + '/'
    stray_members = [
        name
        for name in zf.namelist()
        if name != uid and not name.startswith(subdir_prefix)
    ]
    if stray_members:
        raise ValueError(
            'Zipfile %s.zip does not expand to a single '
            'subdirectory %s/' % (uid, uid)
        )
- # update for git?
def _svn_revision(filename):
    """
    Helper for ``build_index()``: Calculate the subversion revision
    number for a given file (by using ``subprocess`` to run ``svn``).

    :return: the third whitespace-separated field of the output of
        ``svn status -v filename``.
    :raise ValueError: if ``svn`` exits with a non-zero status, writes
        anything to stderr, or produces no output.
    """
    p = subprocess.Popen(
        ['svn', 'status', '-v', filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Read the output as text: with bytes, ``textwrap.fill`` below
        # raises TypeError on Python 3 and masks the real error.
        universal_newlines=True,
    )
    (stdout, stderr) = p.communicate()
    if p.returncode != 0 or stderr or not stdout:
        raise ValueError(
            'Error determining svn_revision for %s: %s'
            % (os.path.split(filename)[1], textwrap.fill(stderr))
        )
    return stdout.split()[2]
def _find_collections(root):
    """
    Helper for ``build_index()``: Yield a list of ElementTree.Element
    objects, each holding the xml for a single package collection.
    Every ``.xml`` file found anywhere below ``root`` is parsed.
    """
    for dirname, _subdirs, files in os.walk(root):
        xml_names = (name for name in files if name.endswith('.xml'))
        for name in xml_names:
            yield ElementTree.parse(os.path.join(dirname, name)).getroot()
def _find_packages(root):
    """
    Helper for ``build_index()``: Yield a list of tuples
    ``(pkg_xml, zf, subdir)``, where:

      - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a
        package
      - ``zf`` is a ``zipfile.ZipFile`` for the package's contents.
        It is yielded open; closing it is up to the caller.
      - ``subdir`` is the subdirectory (relative to ``root``) where
        the package was found (e.g. 'corpora' or 'grammars').

    :raise ValueError: if a zip or xml file cannot be read, if the
        identifier in the xml does not match the file name, or if the
        zip file does not expand to a single matching subdirectory.
    """
    from nltk.corpus.reader.util import _path_from

    # Find all packages.
    for dirname, subdirs, files in os.walk(root):
        relpath = '/'.join(_path_from(root, dirname))
        for filename in files:
            if not filename.endswith('.xml'):
                continue
            xmlfilename = os.path.join(dirname, filename)
            zipfilename = xmlfilename[:-4] + '.zip'
            try:
                zf = zipfile.ZipFile(zipfilename)
            except Exception as e:
                raise ValueError('Error reading file %r!\n%s' % (zipfilename, e))

            # The archive is open from here on: close it again if the
            # package turns out to be unusable, instead of leaking it.
            try:
                try:
                    pkg_xml = ElementTree.parse(xmlfilename).getroot()
                except Exception as e:
                    raise ValueError('Error reading file %r!\n%s' % (xmlfilename, e))

                # Check that the UID matches the filename
                uid = os.path.split(xmlfilename[:-4])[1]
                if pkg_xml.get('id') != uid:
                    raise ValueError(
                        'package identifier mismatch (%s '
                        'vs %s)' % (pkg_xml.get('id'), uid)
                    )

                # Check that the zipfile expands to a subdir whose
                # name matches the uid.
                if any(
                    (name != uid and not name.startswith(uid + '/'))
                    for name in zf.namelist()
                ):
                    raise ValueError(
                        'Zipfile %s.zip does not expand to a '
                        'single subdirectory %s/' % (uid, uid)
                    )
            except Exception:
                zf.close()
                raise

            yield pkg_xml, zf, relpath

        # Don't recurse into svn subdirectories:
        try:
            subdirs.remove('.svn')
        except ValueError:
            pass
- ######################################################################
- # Main:
- ######################################################################
- # There should be a command-line interface
- # Aliases
# Module-level downloader shared by the convenience functions below.
_downloader = Downloader()
download = _downloader.download

def download_shell():
    """Run a ``DownloaderShell`` on the module-level downloader."""
    shell = DownloaderShell(_downloader)
    shell.run()

def download_gui():
    """Open a ``DownloaderGUI`` on the module-level downloader and run its main loop."""
    gui = DownloaderGUI(_downloader)
    gui.mainloop()

def update():
    """Call ``update()`` on the module-level downloader."""
    _downloader.update()
if __name__ == '__main__':
    # Command-line entry point: download the packages named on the
    # command line, or call ``download()`` without an id if none are given.
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        '-d', '--dir',
        dest='dir',
        help='download package to directory DIR',
        metavar='DIR',
    )
    parser.add_option(
        '-q', '--quiet',
        dest='quiet',
        action='store_true',
        default=False,
        help='work quietly',
    )
    parser.add_option(
        '-f', '--force',
        dest='force',
        action='store_true',
        default=False,
        help='download even if already installed',
    )
    parser.add_option(
        '-e', '--exit-on-error',
        dest='halt_on_error',
        action='store_true',
        default=False,
        help='exit if an error occurs',
    )
    parser.add_option(
        '-u', '--url',
        dest='server_index_url',
        default=os.environ.get('NLTK_DOWNLOAD_URL'),
        help='download server index url',
    )

    (options, args) = parser.parse_args()

    downloader = Downloader(server_index_url=options.server_index_url)

    # Keyword arguments shared by every download call.
    shared_kwargs = dict(
        download_dir=options.dir,
        quiet=options.quiet,
        force=options.force,
        halt_on_error=options.halt_on_error,
    )

    if not args:
        downloader.download(**shared_kwargs)
    else:
        for pkg_id in args:
            rv = downloader.download(info_or_id=pkg_id, **shared_kwargs)
            # Stop at the first failed download when -e was given.
            if rv == False and options.halt_on_error:
                break
|