# -*- coding: utf-8 -*-
# Natural Language Toolkit: Compatibility
#
# Copyright (C) 2001-2019 NLTK Project
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
- from __future__ import absolute_import, print_function
- import os
- import sys
- from functools import update_wrapper, wraps
- import fractions
- import unicodedata
- from six import string_types, text_type
# Python 2/3 compatibility layer. Based on six.
PY3 = sys.version_info[0] == 3

if PY3:

    def get_im_class(meth):
        """Return the class of the instance that ``meth`` is bound to."""
        return meth.__self__.__class__

    import io

    StringIO = io.StringIO
    BytesIO = io.BytesIO

    from datetime import timezone

    UTC = timezone.utc

    from tempfile import TemporaryDirectory

else:

    def get_im_class(meth):
        """Return ``meth.im_class`` (the Python 2 spelling of the above)."""
        return meth.im_class

    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    BytesIO = StringIO

    from datetime import tzinfo, timedelta

    ZERO = timedelta(0)
    HOUR = timedelta(hours=1)

    # A UTC class for python 2.7
    class UTC(tzinfo):
        """UTC"""

        def utcoffset(self, dt):
            return ZERO

        def tzname(self, dt):
            return "UTC"

        def dst(self, dt):
            return ZERO

    # The module exposes a tzinfo *instance* under the name ``UTC`` in both
    # branches, so the class name is deliberately rebound to its instance.
    UTC = UTC()

    import csv
    import codecs
    import cStringIO

    class UnicodeWriter:
        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.

        Rows are first written as UTF-8 into an in-memory queue by the
        regular ``csv.writer`` and then re-encoded into the target
        encoding before being written to ``f``.

        see https://docs.python.org/2/library/csv.html
        """

        def __init__(
            self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
        ):
            # Redirect output to a queue
            self.queue = cStringIO.StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            encoder_cls = codecs.getincrementalencoder(encoding)
            # The error policy is fixed here, once, for every later encode().
            self.encoder = encoder_cls(errors=errors)

        def encode(self, data):
            """Return ``data`` as UTF-8 bytes if it is unicode text, else unchanged."""
            # Only unicode text is encoded.  Calling .encode() on a byte
            # string makes Python 2 decode it as ASCII first, which raises
            # UnicodeDecodeError for any non-ASCII byte.
            if isinstance(data, text_type):
                return data.encode("utf-8")
            return data

        def writerow(self, row):
            """Write one row to the target stream in the target encoding."""
            self.writer.writerow([self.encode(s) for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and reencode it into the target encoding.  The second
            # positional parameter of IncrementalEncoder.encode() is
            # ``final``, not ``errors``; the error policy was already given
            # to the encoder's constructor.
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

    import warnings as _warnings
    import os as _os
    from tempfile import mkdtemp

    class TemporaryDirectory(object):
        """Create and return a temporary directory.  This has the same
        behavior as mkdtemp but can be used as a context manager.  For
        example:

            with TemporaryDirectory() as tmpdir:
                ...

        Upon exiting the context, the directory and everything contained
        in it are removed.

        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
        """

        def __init__(self, suffix="", prefix="tmp", dir=None):
            self._closed = False
            self.name = None  # Handle mkdtemp raising an exception
            self.name = mkdtemp(suffix, prefix, dir)

        def __repr__(self):
            return "<{} {!r}>".format(self.__class__.__name__, self.name)

        def __enter__(self):
            return self.name

        def cleanup(self, _warn=False):
            """Remove the directory tree; optionally warn that it was implicit."""
            # Nothing to do if mkdtemp never succeeded or we already ran.
            if not self.name or self._closed:
                return
            try:
                self._rmtree(self.name)
            except (TypeError, AttributeError) as ex:
                # Issue #10188: Emit a warning on stderr
                # if the directory could not be cleaned
                # up due to missing globals
                if "None" not in str(ex):
                    raise
                print(
                    "ERROR: {!r} while cleaning up {!r}".format(ex, self),
                    file=sys.stderr,
                )
                return
            self._closed = True
            if _warn:
                self._warn("Implicitly cleaning up {!r}".format(self), Warning)

        def __exit__(self, exc, value, tb):
            self.cleanup()

        def __del__(self):
            # Issue a Warning if implicit cleanup needed
            self.cleanup(_warn=True)

        # XXX (ncoghlan): The following code attempts to make
        # this class tolerant of the module nulling out process
        # that happens during CPython interpreter shutdown
        # Alas, it doesn't actually manage it. See issue #10188
        _listdir = staticmethod(_os.listdir)
        _path_join = staticmethod(_os.path.join)
        _isdir = staticmethod(_os.path.isdir)
        _islink = staticmethod(_os.path.islink)
        _remove = staticmethod(_os.remove)
        _rmdir = staticmethod(_os.rmdir)
        _warn = _warnings.warn

        def _rmtree(self, path):
            # Essentially a stripped down version of shutil.rmtree.  We can't
            # use globals because they may be None'ed out at shutdown.
            for name in self._listdir(path):
                fullname = self._path_join(path, name)
                try:
                    isdir = self._isdir(fullname) and not self._islink(fullname)
                except OSError:
                    isdir = False
                if isdir:
                    self._rmtree(fullname)
                else:
                    # Best effort: a file that cannot be removed is skipped.
                    try:
                        self._remove(fullname)
                    except OSError:
                        pass
            try:
                self._rmdir(path)
            except OSError:
                pass
# ======= Compatibility for datasets that care about Python versions ========

# Each (category, package) pair below names a dataset that has a /PY3
# subdirectory containing a full copy of the data which has been
# re-encoded or repickled.
DATA_UPDATES = [
    ("chunkers", "maxent_ne_chunker"),
    ("help", "tagsets"),
    ("taggers", "maxent_treebank_pos_tagger"),
    ("tokenizers", "punkt"),
]

# The same datasets as relative paths, joined with the platform separator.
_PY3_DATA_UPDATES = [
    os.path.join(category, package) for category, package in DATA_UPDATES
]
def add_py3_data(path):
    """
    Return ``path`` with a ``/PY3`` component inserted after the dataset name.

    Under Python 3, if ``path`` is a string that contains one of the entries
    of ``_PY3_DATA_UPDATES`` and does not already contain ``/PY3``, then
    ``/PY3`` is inserted right after the first matching entry (and after a
    directly following ``.zip`` suffix, if present).  In every other case
    ``path`` is returned unchanged.

    :param path: the path to adjust
    :return: the adjusted path, or ``path`` itself when nothing applies
    """
    # The insertion is done with find() and slicing, which only work on
    # strings.  Testing ``str(path)`` and then indexing ``path`` itself
    # would raise for any other object whose string form happens to
    # contain a dataset name, so such objects are passed through.
    if not PY3 or not isinstance(path, str):
        return path
    # Already pointing at the Python 3 copy.
    if "/PY3" in path:
        return path
    for item in _PY3_DATA_UPDATES:
        start = path.find(item)
        if start == -1:
            continue
        pos = start + len(item)
        if path[pos : pos + 4] == ".zip":
            pos += 4
        # Only the first matching dataset is rewritten.
        return path[:pos] + "/PY3" + path[pos:]
    return path
# for use in adding /PY3 to the second (filename) argument
# of the file pointers in data.py
def py3_data(init_func):
    """
    Decorate ``init_func`` so that its second positional argument is passed
    through ``add_py3_data`` before the call.

    All other positional and keyword arguments are forwarded untouched.
    """

    @wraps(init_func)
    def _decorator(*args, **kwargs):
        # args[0] is the receiver, args[1] the path to adjust.
        receiver, path = args[0], args[1]
        return init_func(receiver, add_py3_data(path), *args[2:], **kwargs)

    return _decorator
# ======= Compatibility layer for __str__ and __repr__ ==========
def remove_accents(text):
    """
    Return ``text`` with combining marks removed.

    Byte strings are decoded as ASCII first.  The text is then decomposed
    with NFKD normalization and every character whose Unicode category is
    ``Mn`` is dropped.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    category = unicodedata.category  # this gives a small (~10%) speedup
    decomposed = unicodedata.normalize('NFKD', text)
    kept = [char for char in decomposed if category(char) != 'Mn']
    return ''.join(kept)
# Select the best transliteration method, in order of preference.
try:
    # Older versions of Unidecode are licensed under Artistic License;
    # assume an older version is installed.
    from unidecode import unidecode as transliterate
except ImportError:
    transliterate = None

if transliterate is None:
    try:
        # text-unidecode implementation is worse than Unidecode
        # implementation so Unidecode is preferred.
        from text_unidecode import unidecode as transliterate
    except ImportError:
        # This transliteration method should be enough
        # for many Western languages.
        transliterate = remove_accents
def python_2_unicode_compatible(klass):
    """
    This decorator defines the ``__unicode__`` method and fixes
    the ``__repr__`` and ``__str__`` methods under Python 2.

    To support Python 2 and 3 with a single code base,
    define ``__str__`` and ``__repr__`` methods returning unicode
    text and apply this decorator to the class.

    The original ``__repr__`` and ``__str__`` are made available
    as ``unicode_repr`` and ``__unicode__`` (under both Python 2
    and Python 3).

    :raise ValueError: if ``klass`` is an old-style class
    """
    if not issubclass(klass, object):
        raise ValueError("This decorator doesn't work for old-style classes")

    # both __unicode__ and unicode_repr are public because they
    # may be useful in console under Python 2.x

    # if __str__ or __repr__ are not overridden in a subclass,
    # they may be already fixed by this decorator in a parent class
    # and we shouldn't fix them again

    str_method = klass.__str__
    if not _was_fixed(str_method):
        klass.__unicode__ = str_method
        if not PY3:
            klass.__str__ = _7bit(_transliterated(klass.__unicode__))

    repr_method = klass.__repr__
    if not _was_fixed(repr_method):
        klass.unicode_repr = repr_method
        if not PY3:
            klass.__repr__ = _7bit(klass.unicode_repr)

    return klass
def unicode_repr(obj):
    """
    Return a representation of ``obj`` that looks the same under
    Python 2.x and Python 3.x.

    Under Python 3 this is simply ``repr(obj)``.  Under Python 2, objects
    that have a ``unicode_repr`` attribute (e.g. classes fixed with
    @python_2_unicode_compatible) return ``obj.unicode_repr()``; for unicode
    strings the result is returned without the "u" letter; for everything
    else it is the same as ``repr``.
    """
    if not PY3:
        # Python 2.x
        if hasattr(obj, 'unicode_repr'):
            return obj.unicode_repr()
        if isinstance(obj, text_type):
            return repr(obj)[1:]  # strip "u" letter from output
    return repr(obj)
- def _transliterated(method):
- def wrapper(self):
- return transliterate(method(self))
- update_wrapper(wrapper, method, ["__name__", "__doc__"])
- if hasattr(method, "_nltk_compat_7bit"):
- wrapper._nltk_compat_7bit = method._nltk_compat_7bit
- wrapper._nltk_compat_transliterated = True
- return wrapper
- def _7bit(method):
- def wrapper(self):
- return method(self).encode('ascii', 'backslashreplace')
- update_wrapper(wrapper, method, ["__name__", "__doc__"])
- if hasattr(method, "_nltk_compat_transliterated"):
- wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
- wrapper._nltk_compat_7bit = True
- return wrapper
- def _was_fixed(method):
- return getattr(method, "_nltk_compat_7bit", False) or getattr(
- method, "_nltk_compat_transliterated", False
- )
class Fraction(fractions.Fraction):
    """
    This is a simplified backwards compatible version of fractions.Fraction
    from Python >=3.5. It adds the `_normalize` parameter such that it does
    not normalize the denominator to the Greatest Common Divisor (gcd) when
    the numerator is 0.

    This is most probably only used by the nltk.translate.bleu_score.py where
    numerator and denominator of the different ngram precisions are mutable.
    But the idea of "mutable" fraction might not be applicable to other usages,
    See http://stackoverflow.com/questions/34561265

    This object should be deprecated once NLTK stops supporting Python < 3.5
    See https://github.com/nltk/nltk/issues/1330
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=True):
        # Let the standard implementation validate and build the instance.
        fraction = super(Fraction, cls).__new__(cls, numerator, denominator)
        # To emulate fraction.Fraction.from_float across Python >=2.7,
        # the raw values are only restored when normalization is switched
        # off, the numerator is exactly an int (subclasses such as bool do
        # not qualify) and a truthy denominator was given.
        if _normalize or type(numerator) != int or not denominator:
            return fraction
        fraction._numerator = numerator
        fraction._denominator = denominator
        return fraction
|