  1. """
  2. Msgpack serializer support for reading and writing pandas data structures
  3. to disk
  4. portions of msgpack_numpy package, by Lev Givon were incorporated
  5. into this module (and tests_packers.py)
  6. License
  7. =======
  8. Copyright (c) 2013, Lev Givon.
  9. All rights reserved.
  10. Redistribution and use in source and binary forms, with or without
  11. modification, are permitted provided that the following conditions are
  12. met:
  13. * Redistributions of source code must retain the above copyright
  14. notice, this list of conditions and the following disclaimer.
  15. * Redistributions in binary form must reproduce the above
  16. copyright notice, this list of conditions and the following
  17. disclaimer in the documentation and/or other materials provided
  18. with the distribution.
  19. * Neither the name of Lev Givon nor the names of any
  20. contributors may be used to endorse or promote products derived
  21. from this software without specific prior written permission.
  22. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23. "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  25. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  26. OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  27. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  28. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  29. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  30. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  31. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  32. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  33. """

from datetime import date, datetime, timedelta
import os
from textwrap import dedent
import warnings

from dateutil.parser import parse
import numpy as np

import pandas.compat as compat
from pandas.compat import u, u_safe

from pandas.errors import PerformanceWarning

from pandas.util._move import (
    BadMove as _BadMove, move_into_mutable_buffer as _move_into_mutable_buffer)

from pandas.core.dtypes.common import (
    is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype,
    needs_i8_conversion, pandas_dtype)

from pandas import (  # noqa:F401
    Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index,
    Index, Int64Index, Interval, IntervalIndex, MultiIndex, NaT, Panel, Period,
    PeriodIndex, RangeIndex, Series, TimedeltaIndex, Timestamp)
from pandas.core import internals
from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.generic import NDFrame
from pandas.core.internals import BlockManager, _safe_reshape, make_block
from pandas.core.sparse.api import SparseDataFrame, SparseSeries

from pandas.io.common import _stringify_path, get_filepath_or_buffer
from pandas.io.msgpack import ExtType, Packer as _Packer, Unpacker as _Unpacker

# check which compression libs we have installed
try:
    import zlib

    def _check_zlib():
        pass
except ImportError:
    def _check_zlib():
        raise ImportError('zlib is not installed')

_check_zlib.__doc__ = dedent(
    """\
    Check if zlib is installed.

    Raises
    ------
    ImportError
        Raised when zlib is not installed.
    """,
)

try:
    import blosc

    def _check_blosc():
        pass
except ImportError:
    def _check_blosc():
        raise ImportError('blosc is not installed')

_check_blosc.__doc__ = dedent(
    """\
    Check if blosc is installed.

    Raises
    ------
    ImportError
        Raised when blosc is not installed.
    """,
)

# until we can pass this into our conversion functions,
# this is pretty hacky
compressor = None


def to_msgpack(path_or_buf, *args, **kwargs):
    """
    msgpack (serialize) object to input file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, buffer-like, or None
        if None, return generated bytes
    args : an object or objects to serialize
    encoding : encoding for unicode objects
    append : boolean, whether to append to an existing msgpack
        (default is False)
    compress : type of compressor (zlib or blosc), default is None (no
        compression)
    """
    global compressor
    compressor = kwargs.pop('compress', None)
    if compressor:
        compressor = u(compressor)
    append = kwargs.pop('append', None)
    if append:
        mode = 'a+b'
    else:
        mode = 'wb'

    def writer(fh):
        for a in args:
            fh.write(pack(a, **kwargs))

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, compat.string_types):
        with open(path_or_buf, mode) as fh:
            writer(fh)
    elif path_or_buf is None:
        buf = compat.BytesIO()
        writer(buf)
        return buf.getvalue()
    else:
        writer(path_or_buf)
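
# Illustrative usage sketch ('frame.msg' is a hypothetical path):
#
#     >>> df = DataFrame({'A': [1, 2, 3]})
#     >>> to_msgpack('frame.msg', df, compress='zlib')   # write to a file
#     >>> packed = to_msgpack(None, df)                  # or return raw bytes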


def read_msgpack(path_or_buf, encoding='utf-8', iterator=False, **kwargs):
    """
    Load msgpack pandas object from the specified
    file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string file path, bytes, or buffer-like
    encoding : Encoding for decoding msgpack str type
    iterator : boolean, if True, return an iterator to the unpacker
        (default is False)

    Returns
    -------
    obj : same type as object stored in file
    """
    path_or_buf, _, _, should_close = get_filepath_or_buffer(path_or_buf)
    if iterator:
        return Iterator(path_or_buf)

    def read(fh):
        unpacked_obj = list(unpack(fh, encoding=encoding, **kwargs))
        if len(unpacked_obj) == 1:
            return unpacked_obj[0]

        if should_close:
            try:
                path_or_buf.close()
            except IOError:
                pass
        return unpacked_obj

    # see if we have an actual file
    if isinstance(path_or_buf, compat.string_types):
        try:
            exists = os.path.exists(path_or_buf)
        except (TypeError, ValueError):
            exists = False

        if exists:
            with open(path_or_buf, 'rb') as fh:
                return read(fh)

    if isinstance(path_or_buf, compat.binary_type):
        # treat as a binary-like
        fh = None
        try:
            # We can't distinguish between a path and a buffer of bytes in
            # Python 2 so instead assume the first byte of a valid path is
            # less than 0x80.
            if compat.PY3 or ord(path_or_buf[0]) >= 0x80:
                fh = compat.BytesIO(path_or_buf)
                return read(fh)
        finally:
            if fh is not None:
                fh.close()
    elif hasattr(path_or_buf, 'read') and compat.callable(path_or_buf.read):
        # treat as a buffer like
        return read(path_or_buf)

    raise ValueError('path_or_buf needs to be a string file path or file-like')
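
# Illustrative round trip ('frame.msg' is a hypothetical path):
#
#     >>> df = DataFrame({'A': [1, 2, 3]})
#     >>> to_msgpack('frame.msg', df)
#     >>> read_msgpack('frame.msg')                      # returns the DataFrame
#     >>> it = read_msgpack('frame.msg', iterator=True)  # or stream via Iterator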


dtype_dict = {21: np.dtype('M8[ns]'),
              u('datetime64[ns]'): np.dtype('M8[ns]'),
              u('datetime64[us]'): np.dtype('M8[us]'),
              22: np.dtype('m8[ns]'),
              u('timedelta64[ns]'): np.dtype('m8[ns]'),
              u('timedelta64[us]'): np.dtype('m8[us]'),

              # this is platform int, which we need to remap to np.int64
              # for compat on windows platforms
              7: np.dtype('int64'),
              'category': 'category'
              }


def dtype_for(t):
    """ return my dtype mapping, whether number or name """
    if t in dtype_dict:
        return dtype_dict[t]
    return np.typeDict.get(t, t)


c2f_dict = {'complex': np.float64,
            'complex128': np.float64,
            'complex64': np.float32}

# windows (32 bit) compat
if hasattr(np, 'float128'):
    c2f_dict['complex256'] = np.float128


def c2f(r, i, ctype_name):
    """
    Convert strings to complex number instance with specified numpy type.
    """

    ftype = c2f_dict[ctype_name]
    return np.typeDict[ctype_name](ftype(r) + 1j * ftype(i))
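
# Illustrative sketch: rebuild a numpy complex scalar from the string
# representations of its real and imaginary parts:
#
#     >>> c2f('1.0', '-2.5', 'complex128')
#     (1-2.5j)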


def convert(values):
    """ convert the numpy values to a list """

    dtype = values.dtype

    if is_categorical_dtype(values):
        return values

    elif is_object_dtype(dtype):
        return values.ravel().tolist()

    if needs_i8_conversion(dtype):
        values = values.view('i8')
    v = values.ravel()

    if compressor == 'zlib':
        _check_zlib()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, zlib.compress(v))

    elif compressor == 'blosc':
        _check_blosc()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))

    # ndarray (on original dtype)
    return ExtType(0, v.tostring())


def unconvert(values, dtype, compress=None):
    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to copying `np.fromstring`

    # Copy the bytes into a numpy array.
    buf = np.frombuffer(values, dtype=dtype)
    buf = buf.copy()  # required to not mutate the original data
    buf.flags.writeable = True
    return buf
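
# Round-trip sketch for convert/unconvert (illustrative; relies on the
# module-level `compressor` global that to_msgpack normally sets):
#
#     >>> import pandas.io.packers as packers
#     >>> packers.compressor = u'zlib'
#     >>> ext = packers.convert(np.arange(4, dtype='int64'))  # ExtType(0, ...)
#     >>> packers.unconvert(ext, np.dtype('int64'), compress=u'zlib')
#     array([0, 1, 2, 3])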


def encode(obj):
    """
    Data encoder
    """
    tobj = type(obj)
    if isinstance(obj, Index):
        if isinstance(obj, RangeIndex):
            return {u'typ': u'range_index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'start': getattr(obj, '_start', None),
                    u'stop': getattr(obj, '_stop', None),
                    u'step': getattr(obj, '_step', None)}
        elif isinstance(obj, PeriodIndex):
            return {u'typ': u'period_index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'freq': u_safe(getattr(obj, 'freqstr', None)),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.asi8),
                    u'compress': compressor}
        elif isinstance(obj, DatetimeIndex):
            tz = getattr(obj, 'tz', None)

            # store tz info and data as UTC
            if tz is not None:
                tz = u(tz.zone)
                obj = obj.tz_convert('UTC')
            return {u'typ': u'datetime_index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.asi8),
                    u'freq': u_safe(getattr(obj, 'freqstr', None)),
                    u'tz': tz,
                    u'compress': compressor}
        elif isinstance(obj, (IntervalIndex, IntervalArray)):
            if isinstance(obj, IntervalIndex):
                typ = u'interval_index'
            else:
                typ = u'interval_array'
            return {u'typ': typ,
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'left': getattr(obj, 'left', None),
                    u'right': getattr(obj, 'right', None),
                    u'closed': getattr(obj, 'closed', None)}
        elif isinstance(obj, MultiIndex):
            return {u'typ': u'multi_index',
                    u'klass': u(obj.__class__.__name__),
                    u'names': getattr(obj, 'names', None),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.values),
                    u'compress': compressor}
        else:
            return {u'typ': u'index',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.values),
                    u'compress': compressor}

    elif isinstance(obj, Categorical):
        return {u'typ': u'category',
                u'klass': u(obj.__class__.__name__),
                u'name': getattr(obj, 'name', None),
                u'codes': obj.codes,
                u'categories': obj.categories,
                u'ordered': obj.ordered,
                u'compress': compressor}

    elif isinstance(obj, Series):
        if isinstance(obj, SparseSeries):
            raise NotImplementedError(
                'msgpack sparse series is not implemented'
            )
            # d = {'typ': 'sparse_series',
            #      'klass': obj.__class__.__name__,
            #      'dtype': obj.dtype.name,
            #      'index': obj.index,
            #      'sp_index': obj.sp_index,
            #      'sp_values': convert(obj.sp_values),
            #      'compress': compressor}
            # for f in ['name', 'fill_value', 'kind']:
            #     d[f] = getattr(obj, f, None)
            # return d
        else:
            return {u'typ': u'series',
                    u'klass': u(obj.__class__.__name__),
                    u'name': getattr(obj, 'name', None),
                    u'index': obj.index,
                    u'dtype': u(obj.dtype.name),
                    u'data': convert(obj.values),
                    u'compress': compressor}
    elif issubclass(tobj, NDFrame):
        if isinstance(obj, SparseDataFrame):
            raise NotImplementedError(
                'msgpack sparse frame is not implemented'
            )
            # d = {'typ': 'sparse_dataframe',
            #      'klass': obj.__class__.__name__,
            #      'columns': obj.columns}
            # for f in ['default_fill_value', 'default_kind']:
            #     d[f] = getattr(obj, f, None)
            # d['data'] = dict([(name, ss)
            #                   for name, ss in compat.iteritems(obj)])
            # return d
        else:
            data = obj._data
            if not data.is_consolidated():
                data = data.consolidate()

            # the block manager
            return {u'typ': u'block_manager',
                    u'klass': u(obj.__class__.__name__),
                    u'axes': data.axes,
                    u'blocks': [{u'locs': b.mgr_locs.as_array,
                                 u'values': convert(b.values),
                                 u'shape': b.values.shape,
                                 u'dtype': u(b.dtype.name),
                                 u'klass': u(b.__class__.__name__),
                                 u'compress': compressor} for b in data.blocks]
                    }

    elif isinstance(obj, (datetime, date, np.datetime64, timedelta,
                          np.timedelta64)) or obj is NaT:
        if isinstance(obj, Timestamp):
            tz = obj.tzinfo
            if tz is not None:
                tz = u(tz.zone)
            freq = obj.freq
            if freq is not None:
                freq = u(freq.freqstr)
            return {u'typ': u'timestamp',
                    u'value': obj.value,
                    u'freq': freq,
                    u'tz': tz}
        if obj is NaT:
            return {u'typ': u'nat'}
        elif isinstance(obj, np.timedelta64):
            return {u'typ': u'timedelta64',
                    u'data': obj.view('i8')}
        elif isinstance(obj, timedelta):
            return {u'typ': u'timedelta',
                    u'data': (obj.days, obj.seconds, obj.microseconds)}
        elif isinstance(obj, np.datetime64):
            return {u'typ': u'datetime64',
                    u'data': u(str(obj))}
        elif isinstance(obj, datetime):
            return {u'typ': u'datetime',
                    u'data': u(obj.isoformat())}
        elif isinstance(obj, date):
            return {u'typ': u'date',
                    u'data': u(obj.isoformat())}
        raise Exception(
            "cannot encode this datetimelike object: {obj}".format(obj=obj))
    elif isinstance(obj, Period):
        return {u'typ': u'period',
                u'ordinal': obj.ordinal,
                u'freq': u_safe(obj.freqstr)}
    elif isinstance(obj, Interval):
        return {u'typ': u'interval',
                u'left': obj.left,
                u'right': obj.right,
                u'closed': obj.closed}
    elif isinstance(obj, BlockIndex):
        return {u'typ': u'block_index',
                u'klass': u(obj.__class__.__name__),
                u'blocs': obj.blocs,
                u'blengths': obj.blengths,
                u'length': obj.length}
    elif isinstance(obj, IntIndex):
        return {u'typ': u'int_index',
                u'klass': u(obj.__class__.__name__),
                u'indices': obj.indices,
                u'length': obj.length}
    elif isinstance(obj, np.ndarray):
        return {u'typ': u'ndarray',
                u'shape': obj.shape,
                u'ndim': obj.ndim,
                u'dtype': u(obj.dtype.name),
                u'data': convert(obj),
                u'compress': compressor}
    elif isinstance(obj, np.number):
        if np.iscomplexobj(obj):
            return {u'typ': u'np_scalar',
                    u'sub_typ': u'np_complex',
                    u'dtype': u(obj.dtype.name),
                    u'real': u(obj.real.__repr__()),
                    u'imag': u(obj.imag.__repr__())}
        else:
            return {u'typ': u'np_scalar',
                    u'dtype': u(obj.dtype.name),
                    u'data': u(obj.__repr__())}
    elif isinstance(obj, complex):
        return {u'typ': u'np_complex',
                u'real': u(obj.real.__repr__()),
                u'imag': u(obj.imag.__repr__())}

    return obj
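
# What encode() emits for a scalar (illustrative sketch; decode() inverts it):
#
#     >>> d = encode(Timestamp('2019-01-01', tz='UTC'))  # plain dict of primitives
#     >>> decode(d)
#     Timestamp('2019-01-01 00:00:00+0000', tz='UTC')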


def decode(obj):
    """
    Decoder for deserializing numpy data types.
    """

    typ = obj.get(u'typ')
    if typ is None:
        return obj
    elif typ == u'timestamp':
        freq = obj[u'freq'] if 'freq' in obj else obj[u'offset']
        return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq)
    elif typ == u'nat':
        return NaT
    elif typ == u'period':
        return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
    elif typ == u'index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype,
                         obj.get(u'compress'))
        return Index(data, dtype=dtype, name=obj[u'name'])
    elif typ == u'range_index':
        return RangeIndex(obj[u'start'],
                          obj[u'stop'],
                          obj[u'step'],
                          name=obj[u'name'])
    elif typ == u'multi_index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype,
                         obj.get(u'compress'))
        data = [tuple(x) for x in data]
        return MultiIndex.from_tuples(data, names=obj[u'names'])
    elif typ == u'period_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        d = dict(name=obj[u'name'], freq=obj[u'freq'])
        freq = d.pop('freq', None)
        return PeriodIndex(PeriodArray(data, freq), **d)

    elif typ == u'datetime_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        d = dict(name=obj[u'name'], freq=obj[u'freq'])
        result = DatetimeIndex(data, **d)
        tz = obj[u'tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result

    elif typ in (u'interval_index', u'interval_array'):
        return globals()[obj[u'klass']].from_arrays(obj[u'left'],
                                                    obj[u'right'],
                                                    obj[u'closed'],
                                                    name=obj[u'name'])
    elif typ == u'category':
        from_codes = globals()[obj[u'klass']].from_codes
        return from_codes(codes=obj[u'codes'],
                          categories=obj[u'categories'],
                          ordered=obj[u'ordered'])

    elif typ == u'interval':
        return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
    elif typ == u'series':
        dtype = dtype_for(obj[u'dtype'])
        pd_dtype = pandas_dtype(dtype)

        index = obj[u'index']
        result = Series(unconvert(obj[u'data'], dtype, obj[u'compress']),
                        index=index,
                        dtype=pd_dtype,
                        name=obj[u'name'])
        return result

    elif typ == u'block_manager':
        axes = obj[u'axes']

        def create_block(b):
            values = _safe_reshape(unconvert(
                b[u'values'], dtype_for(b[u'dtype']),
                b[u'compress']), b[u'shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u'locs' in b:
                placement = b[u'locs']
            else:
                placement = axes[0].get_indexer(b[u'items'])

            if is_datetime64tz_dtype(b[u'dtype']):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == 'M8[ns]', values.dtype
                values = DatetimeArray(values, dtype=b[u'dtype'])

            return make_block(values=values,
                              klass=getattr(internals, b[u'klass']),
                              placement=placement,
                              dtype=b[u'dtype'])

        blocks = [create_block(b) for b in obj[u'blocks']]
        return globals()[obj[u'klass']](BlockManager(blocks, axes))
    elif typ == u'datetime':
        return parse(obj[u'data'])
    elif typ == u'datetime64':
        return np.datetime64(parse(obj[u'data']))
    elif typ == u'date':
        return parse(obj[u'data']).date()
    elif typ == u'timedelta':
        return timedelta(*obj[u'data'])
    elif typ == u'timedelta64':
        return np.timedelta64(int(obj[u'data']))
    # elif typ == 'sparse_series':
    #     dtype = dtype_for(obj['dtype'])
    #     return SparseSeries(
    #         unconvert(obj['sp_values'], dtype, obj['compress']),
    #         sparse_index=obj['sp_index'], index=obj['index'],
    #         fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
    # elif typ == 'sparse_dataframe':
    #     return SparseDataFrame(
    #         obj['data'], columns=obj['columns'],
    #         default_fill_value=obj['default_fill_value'],
    #         default_kind=obj['default_kind']
    #     )
    # elif typ == 'sparse_panel':
    #     return SparsePanel(
    #         obj['data'], items=obj['items'],
    #         default_fill_value=obj['default_fill_value'],
    #         default_kind=obj['default_kind'])
    elif typ == u'block_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
                                        obj[u'blengths'])
    elif typ == u'int_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
    elif typ == u'ndarray':
        return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
                         obj.get(u'compress')).reshape(obj[u'shape'])
    elif typ == u'np_scalar':
        if obj.get(u'sub_typ') == u'np_complex':
            return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
        else:
            dtype = dtype_for(obj[u'dtype'])
            try:
                return dtype(obj[u'data'])
            except (ValueError, TypeError):
                return dtype.type(obj[u'data'])
    elif typ == u'np_complex':
        return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')
    elif isinstance(obj, (dict, list, set)):
        return obj
    else:
        return obj
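
# Illustrative sketch of decode() dispatching on the 'typ' tag:
#
#     >>> decode({u'typ': u'period', u'ordinal': 588, u'freq': u'M'})
#     Period('2019-01', 'M')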


def pack(o, default=encode,
         encoding='utf-8', unicode_errors='strict', use_single_float=False,
         autoreset=1, use_bin_type=1):
    """
    Pack an object and return the packed bytes.
    """

    return Packer(default=default, encoding=encoding,
                  unicode_errors=unicode_errors,
                  use_single_float=use_single_float,
                  autoreset=autoreset,
                  use_bin_type=use_bin_type).pack(o)


def unpack(packed, object_hook=decode,
           list_hook=None, use_list=False, encoding='utf-8',
           unicode_errors='strict', object_pairs_hook=None,
           max_buffer_size=0, ext_hook=ExtType):
    """
    Unpack a packed object, return an iterator
    Note: packed lists will be returned as tuples
    """

    return Unpacker(packed, object_hook=object_hook,
                    list_hook=list_hook,
                    use_list=use_list, encoding=encoding,
                    unicode_errors=unicode_errors,
                    object_pairs_hook=object_pairs_hook,
                    max_buffer_size=max_buffer_size,
                    ext_hook=ext_hook)
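
# Low-level round trip through pack/unpack (illustrative sketch):
#
#     >>> packed = pack(Series([1, 2, 3]))      # bytes, encode() applied per object
#     >>> [s] = list(unpack(compat.BytesIO(packed)))  # decode() rebuilds the Series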


class Packer(_Packer):

    def __init__(self, default=encode,
                 encoding='utf-8',
                 unicode_errors='strict',
                 use_single_float=False,
                 autoreset=1,
                 use_bin_type=1):
        super(Packer, self).__init__(default=default,
                                     encoding=encoding,
                                     unicode_errors=unicode_errors,
                                     use_single_float=use_single_float,
                                     autoreset=autoreset,
                                     use_bin_type=use_bin_type)


class Unpacker(_Unpacker):

    def __init__(self, file_like=None, read_size=0, use_list=False,
                 object_hook=decode,
                 object_pairs_hook=None, list_hook=None, encoding='utf-8',
                 unicode_errors='strict', max_buffer_size=0, ext_hook=ExtType):
        super(Unpacker, self).__init__(file_like=file_like,
                                       read_size=read_size,
                                       use_list=use_list,
                                       object_hook=object_hook,
                                       object_pairs_hook=object_pairs_hook,
                                       list_hook=list_hook,
                                       encoding=encoding,
                                       unicode_errors=unicode_errors,
                                       max_buffer_size=max_buffer_size,
                                       ext_hook=ext_hook)


class Iterator(object):

    """ manage the unpacking iteration,
        close the file on completion """

    def __init__(self, path, **kwargs):
        self.path = path
        self.kwargs = kwargs

    def __iter__(self):

        needs_closing = True
        try:

            # see if we have an actual file
            if isinstance(self.path, compat.string_types):

                try:
                    path_exists = os.path.exists(self.path)
                except TypeError:
                    path_exists = False

                if path_exists:
                    fh = open(self.path, 'rb')
                else:
                    fh = compat.BytesIO(self.path)

            else:

                if not hasattr(self.path, 'read'):
                    fh = compat.BytesIO(self.path)

                else:

                    # a file-like
                    needs_closing = False
                    fh = self.path

            unpacker = unpack(fh)
            for o in unpacker:
                yield o
        finally:
            if needs_closing:
                fh.close()
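
# Streaming several objects from one msgpack file (illustrative sketch;
# 'objs.msg' is a hypothetical path):
#
#     >>> to_msgpack('objs.msg', Series([1]), Series([2]))
#     >>> for obj in read_msgpack('objs.msg', iterator=True):
#     ...     print(type(obj))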