common.py
  1. """Common IO api utilities"""
  2. import codecs
  3. from contextlib import closing, contextmanager
  4. import csv
  5. import mmap
  6. import os
  7. import zipfile
  8. import pandas.compat as compat
  9. from pandas.compat import BytesIO, StringIO, string_types, text_type
  10. from pandas.errors import ( # noqa
  11. AbstractMethodError, DtypeWarning, EmptyDataError, ParserError,
  12. ParserWarning)
  13. from pandas.core.dtypes.common import is_file_like, is_number
  14. from pandas.io.formats.printing import pprint_thing
  15. # gh-12665: Alias for now and remove later.
  16. CParserError = ParserError
  17. # common NA values
  18. # no longer excluding inf representations
  19. # '1.#INF','-1.#INF', '1.#INF000000',
  20. _NA_VALUES = {'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
  21. 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan',
  22. '-nan', ''}
  23. if compat.PY3:
  24. from urllib.request import urlopen, pathname2url
  25. _urlopen = urlopen
  26. from urllib.parse import urlparse as parse_url
  27. from urllib.parse import (uses_relative, uses_netloc, uses_params,
  28. urlencode, urljoin)
  29. from urllib.error import URLError
  30. from http.client import HTTPException # noqa
  31. else:
  32. from urllib2 import urlopen as _urlopen
  33. from urllib import urlencode, pathname2url # noqa
  34. from urlparse import urlparse as parse_url
  35. from urlparse import uses_relative, uses_netloc, uses_params, urljoin
  36. from urllib2 import URLError # noqa
  37. from httplib import HTTPException # noqa
  38. from contextlib import contextmanager, closing # noqa
  39. from functools import wraps # noqa
  40. # @wraps(_urlopen)
  41. @contextmanager
  42. def urlopen(*args, **kwargs):
  43. with closing(_urlopen(*args, **kwargs)) as f:
  44. yield f
  45. _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
  46. _VALID_URLS.discard('')
  47. class BaseIterator(object):
  48. """Subclass this and provide a "__next__()" method to obtain an iterator.
  49. Useful only when the object being iterated is non-reusable (e.g. OK for a
  50. parser, not for an in-memory table, yes for its iterator)."""
  51. def __iter__(self):
  52. return self
  53. def __next__(self):
  54. raise AbstractMethodError(self)
  55. if not compat.PY3:
  56. BaseIterator.next = lambda self: self.__next__()
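

# Illustrative sketch (not part of the original module): a minimal
# BaseIterator subclass over a one-shot stream. The ``LineIterator`` name
# and the wrapped handle are hypothetical; subclasses only need __next__,
# and the Python 2 ``next`` shim above is inherited automatically.
#
#   class LineIterator(BaseIterator):
#       def __init__(self, handle):
#           self.handle = handle
#
#       def __next__(self):
#           line = self.handle.readline()
#           if not line:
#               raise StopIteration
#           return line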


def _is_url(url):
    """Check to see if a URL has a valid protocol.

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    isurl : bool
        If `url` has a valid protocol return True otherwise False.
    """
    try:
        return parse_url(url).scheme in _VALID_URLS
    except Exception:
        return False
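

# Illustrative usage (sketch, not part of the original module):
#
#   >>> _is_url('https://example.com/data.csv')
#   True
#   >>> _is_url('data.csv')            # no scheme -> not a URL
#   False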


def _expand_user(filepath_or_buffer):
    """Return the argument with an initial component of ~ or ~user
    replaced by that user's home directory.

    Parameters
    ----------
    filepath_or_buffer : object to be converted if possible

    Returns
    -------
    expanded_filepath_or_buffer : an expanded filepath or the
        input if not expandable
    """
    if isinstance(filepath_or_buffer, string_types):
        return os.path.expanduser(filepath_or_buffer)
    return filepath_or_buffer
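

# Illustrative usage (sketch, not in the original module; the expanded
# result depends on the current user's home directory, 'alice' here is
# hypothetical):
#
#   >>> _expand_user('~/data.csv')     # doctest: +SKIP
#   '/home/alice/data.csv'
#   >>> _expand_user(42)               # non-strings pass through unchanged
#   42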


def _validate_header_arg(header):
    if isinstance(header, bool):
        raise TypeError("Passing a bool to header is invalid. "
                        "Use header=None for no header or "
                        "header=int or list-like of ints to specify "
                        "the row(s) making up the column names")


def _stringify_path(filepath_or_buffer):
    """Attempt to convert a path-like object to a string.

    Parameters
    ----------
    filepath_or_buffer : object to be converted

    Returns
    -------
    str_filepath_or_buffer : maybe a string version of the object

    Notes
    -----
    Objects supporting the fspath protocol (python 3.6+) are coerced
    according to their __fspath__ method.

    For backwards compatibility with older pythons, pathlib.Path and
    py.path objects are specially coerced.

    Any other object is passed through unchanged, which includes bytes,
    strings, buffers, or anything else that's not even path-like.
    """
    try:
        import pathlib
        _PATHLIB_INSTALLED = True
    except ImportError:
        _PATHLIB_INSTALLED = False

    try:
        from py.path import local as LocalPath
        _PY_PATH_INSTALLED = True
    except ImportError:
        _PY_PATH_INSTALLED = False

    if hasattr(filepath_or_buffer, '__fspath__'):
        return filepath_or_buffer.__fspath__()
    if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
        return text_type(filepath_or_buffer)
    if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):
        return filepath_or_buffer.strpath
    return _expand_user(filepath_or_buffer)
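

# Illustrative usage (sketch, not in the original module; POSIX-style
# output shown, Windows paths use backslashes):
#
#   >>> import pathlib
#   >>> _stringify_path(pathlib.Path('data') / 'file.csv')
#   'data/file.csv'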


def is_s3_url(url):
    """Check for an s3, s3n, or s3a url"""
    try:
        return parse_url(url).scheme in ['s3', 's3n', 's3a']
    except Exception:
        return False


def is_gcs_url(url):
    """Check for a gcs url"""
    try:
        return parse_url(url).scheme in ['gcs', 'gs']
    except Exception:
        return False
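

# Illustrative usage (sketch, not in the original module; bucket and key
# names are hypothetical):
#
#   >>> is_s3_url('s3://my-bucket/data.csv')
#   True
#   >>> is_gcs_url('gs://my-bucket/data.csv')
#   True
#   >>> is_s3_url('https://example.com/data.csv')
#   False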


def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None, mode=None):
    """
    If the filepath_or_buffer is a url, translate and return the buffer.
    Otherwise passthrough.

    Parameters
    ----------
    filepath_or_buffer : a url, filepath (str, py.path.local or pathlib.Path),
                         or buffer
    encoding : the encoding to use to decode py3 bytes, default is 'utf-8'
    mode : str, optional

    Returns
    -------
    tuple of ({a filepath or buffer or S3File instance},
              encoding, str,
              compression, str,
              should_close, bool)
    """
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if _is_url(filepath_or_buffer):
        req = _urlopen(filepath_or_buffer)
        content_encoding = req.headers.get('Content-Encoding', None)
        if content_encoding == 'gzip':
            # Override compression based on Content-Encoding header
            compression = 'gzip'
        reader = BytesIO(req.read())
        req.close()
        return reader, encoding, compression, True

    if is_s3_url(filepath_or_buffer):
        from pandas.io import s3
        return s3.get_filepath_or_buffer(filepath_or_buffer,
                                         encoding=encoding,
                                         compression=compression,
                                         mode=mode)

    if is_gcs_url(filepath_or_buffer):
        from pandas.io import gcs
        return gcs.get_filepath_or_buffer(filepath_or_buffer,
                                          encoding=encoding,
                                          compression=compression,
                                          mode=mode)

    if isinstance(filepath_or_buffer, (compat.string_types,
                                       compat.binary_type,
                                       mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
        msg = "Invalid file path or buffer object type: {_type}"
        raise ValueError(msg.format(_type=type(filepath_or_buffer)))

    return filepath_or_buffer, None, compression, False
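

# Illustrative usage (sketch, not in the original module): a local path is
# passed through (with ~ expanded) and nothing needs closing, while a URL
# is downloaded into an in-memory buffer that the caller should close.
#
#   >>> fp, enc, comp, should_close = get_filepath_or_buffer('data.csv')
#   >>> (fp, enc, comp, should_close)
#   ('data.csv', None, None, False)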


def file_path_to_url(path):
    """
    Converts an absolute native path to a FILE URL.

    Parameters
    ----------
    path : a path in native format

    Returns
    -------
    a valid FILE URL
    """
    return urljoin('file:', pathname2url(path))
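

# Illustrative usage (sketch, not in the original module; POSIX-style
# output shown, Windows drive letters are URL-encoded differently):
#
#   >>> file_path_to_url('/tmp/data.csv')
#   'file:///tmp/data.csv'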


_compression_to_extension = {
    'gzip': '.gz',
    'bz2': '.bz2',
    'zip': '.zip',
    'xz': '.xz',
}


def _infer_compression(filepath_or_buffer, compression):
    """
    Get the compression method for filepath_or_buffer. If compression='infer',
    the inferred compression method is returned. Otherwise, the input
    compression method is returned unchanged, unless it's invalid, in which
    case an error is raised.

    Parameters
    ----------
    filepath_or_buffer :
        a path (str) or buffer
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).

    Returns
    -------
    string or None :
        compression method

    Raises
    ------
    ValueError on invalid compression specified
    """

    # No compression has been explicitly specified
    if compression is None:
        return None

    # Infer compression
    if compression == 'infer':
        # Convert all path types (e.g. pathlib.Path) to strings
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            # Cannot infer compression of a buffer, assume no compression
            return None

        # Infer compression from the filename/URL extension
        for compression, extension in _compression_to_extension.items():
            if filepath_or_buffer.endswith(extension):
                return compression
        return None

    # Compression has been specified. Check that it's valid
    if compression in _compression_to_extension:
        return compression

    msg = 'Unrecognized compression type: {}'.format(compression)
    valid = ['infer', None] + sorted(_compression_to_extension)
    msg += '\nValid compression types are {}'.format(valid)
    raise ValueError(msg)
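

# Illustrative usage (sketch, not in the original module):
#
#   >>> _infer_compression('data.csv.gz', 'infer')
#   'gzip'
#   >>> _infer_compression('data.csv', 'infer') is None
#   True
#   >>> _infer_compression('data.csv', 'bz2')  # explicit value passes through
#   'bz2'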


def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf :
        a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
        If 'infer' and `filepath_or_buffer` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of the file-like objects that were opened in this function.
    """
    try:
        from s3fs import S3File
        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO,)

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, compat.string_types)

    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        if compat.PY2 and not is_path and encoding:
            msg = 'compression with encoding is not yet supported in Python 2'
            raise ValueError(msg)

        # GZ Compression
        if compression == 'gzip':
            import gzip
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == 'bz2':
            import bz2
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            elif compat.PY2:
                # Python 2's bz2 module can't take file objects, so have to
                # run through decompress manually
                f = StringIO(bz2.decompress(path_or_buf.read()))
                path_or_buf.close()
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == 'zip':
            zf = BytesZipFile(path_or_buf, mode)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == 'w':
                f = zf
            elif zf.mode == 'r':
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError('Zero files found in ZIP file {}'
                                     .format(path_or_buf))
                else:
                    raise ValueError('Multiple files found in ZIP file.'
                                     ' Only one file per ZIP: {}'
                                     .format(zip_names))

        # XZ Compression
        elif compression == 'xz':
            lzma = compat.import_lzma()
            f = lzma.LZMAFile(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = 'Unrecognized compression type: {}'.format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if compat.PY2:
            # Python 2
            mode = "wb" if mode == "w" else mode
            f = open(path_or_buf, mode)
        elif encoding:
            # Python 3 and encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # Python 3 and no explicit encoding
            f = open(path_or_buf, mode, errors='replace', newline="")
        else:
            # Python 3 and binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # in Python 3, convert BytesIO or fileobjects passed with an encoding
    if (compat.PY3 and is_text and
            (compression or isinstance(f, need_text_wrapping))):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding, newline='')
        handles.append(f)

    if memory_map and hasattr(f, 'fileno'):
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
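

# Illustrative usage (sketch, not in the original module): callers are
# responsible for closing everything collected in ``handles``.
#
#   >>> f, handles = _get_handle('data.csv.gz', 'r',
#   ...                          encoding='utf-8', compression='infer')
#   >>> try:
#   ...     first_line = f.readline()
#   ... finally:
#   ...     for h in handles:
#   ...         h.close()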


class BytesZipFile(zipfile.ZipFile, BytesIO):
    """
    Wrapper for the standard library class ZipFile that allows the returned
    file-like handle to accept byte strings via its `write` method.

    BytesIO provides the attributes of a file-like object, and
    ZipFile.writestr writes byte strings into a member of the archive.
    """
    # GH 17778
    def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
        if mode in ['wb', 'rb']:
            mode = mode.replace('b', '')
        super(BytesZipFile, self).__init__(file, mode, compression, **kwargs)

    def write(self, data):
        super(BytesZipFile, self).writestr(self.filename, data)

    @property
    def closed(self):
        return self.fp is None
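

# Illustrative usage (sketch, not in the original module): writing through
# the handle stores the bytes as an archive member named, in this version,
# after the archive path itself. The 'out.zip' path is hypothetical.
#
#   >>> zf = BytesZipFile('out.zip', 'w')
#   >>> zf.write(b'a,b\n1,2\n')        # stored as a member named 'out.zip'
#   >>> zf.close()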


class MMapWrapper(BaseIterator):
    """
    Wrapper for Python's mmap class so that it can be properly read in
    by Python's csv.reader class.

    Parameters
    ----------
    f : file object
        File object to be mapped onto memory. Must support the 'fileno'
        method or have an equivalent attribute
    """

    def __init__(self, f):
        self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

    def __getattr__(self, name):
        return getattr(self.mmap, name)

    def __iter__(self):
        return self

    def __next__(self):
        newline = self.mmap.readline()

        # readline returns bytes, not str, in Python 3,
        # but Python's CSV reader expects str, so convert
        # the output to str before continuing
        if compat.PY3:
            newline = compat.bytes_to_str(newline)

        # mmap doesn't raise if reading past the allocated
        # data but instead returns an empty string, so raise
        # if that is returned
        if newline == '':
            raise StopIteration
        return newline


if not compat.PY3:
    MMapWrapper.next = lambda self: self.__next__()
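

# Illustrative usage (sketch, not in the original module; assumes
# 'data.csv' exists and is non-empty, since mmap rejects empty files):
#
#   >>> with open('data.csv', 'r') as fh:
#   ...     for line in MMapWrapper(fh):
#   ...         pass                   # feed lines to csv.reader, etc.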


class UTF8Recoder(BaseIterator):
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """

    def __init__(self, f, encoding):
        self.reader = codecs.getreader(encoding)(f)

    def read(self, bytes=-1):
        return self.reader.read(bytes).encode("utf-8")

    def readline(self):
        return self.reader.readline().encode("utf-8")

    def next(self):
        return next(self.reader).encode("utf-8")


if compat.PY3:  # pragma: no cover
    def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
        # ignore encoding
        return csv.reader(f, dialect=dialect, **kwds)

    def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
        return csv.writer(f, dialect=dialect, **kwds)
else:
    class UnicodeReader(BaseIterator):
        """
        A CSV reader which will iterate over lines in the CSV file "f",
        which is encoded in the given encoding.

        On Python 3, this is replaced (below) by csv.reader, which handles
        unicode.
        """

        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
            f = UTF8Recoder(f, encoding)
            self.reader = csv.reader(f, dialect=dialect, **kwds)

        def __next__(self):
            row = next(self.reader)
            return [compat.text_type(s, "utf-8") for s in row]

    class UnicodeWriter(object):
        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.
        """

        def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
            # Redirect output to a queue
            self.queue = StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            self.encoder = codecs.getincrementalencoder(encoding)()
            self.quoting = kwds.get("quoting", None)

        def writerow(self, row):
            def _check_as_is(x):
                return (self.quoting == csv.QUOTE_NONNUMERIC and
                        is_number(x)) or isinstance(x, str)

            row = [x if _check_as_is(x)
                   else pprint_thing(x).encode("utf-8") for x in row]
            self.writer.writerow([s for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and re-encode it into the target encoding
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

        def writerows(self, rows):
            def _check_as_is(x):
                return (self.quoting == csv.QUOTE_NONNUMERIC and
                        is_number(x)) or isinstance(x, str)

            for i, row in enumerate(rows):
                rows[i] = [x if _check_as_is(x)
                           else pprint_thing(x).encode("utf-8") for x in row]

            self.writer.writerows([[s for s in row] for row in rows])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and re-encode it into the target encoding
            data = self.encoder.encode(data)
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)
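

# Illustrative usage (sketch, not in the original module): on Python 3
# UnicodeWriter is simply csv.writer, so a text buffer works; on Python 2
# it writes encoded bytes to the target stream.
#
#   >>> buf = compat.StringIO()
#   >>> writer = UnicodeWriter(buf, encoding='utf-8')
#   >>> writer.writerow(['a', 'b'])
#   >>> buf.getvalue()
#   'a,b\r\n'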