# pylint: disable-msg=E1101,W0613,W0603
"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""

import copy
from datetime import date, datetime
from distutils.version import LooseVersion
import itertools
import os
import re
import time
import warnings

import numpy as np

from pandas._libs import algos, lib, writers as libwriters
from pandas._libs.tslibs import timezones
from pandas.compat import PY3, filter, lrange, range, string_types
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import (
    ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype,
    is_datetime64_dtype, is_datetime64tz_dtype, is_list_like,
    is_timedelta64_dtype)
from pandas.core.dtypes.missing import array_equivalent

from pandas import (
    DataFrame, DatetimeIndex, Index, Int64Index, MultiIndex, Panel,
    PeriodIndex, Series, SparseDataFrame, SparseSeries, TimedeltaIndex, compat,
    concat, isna, to_datetime)
from pandas.core import config
from pandas.core.algorithms import match, unique
from pandas.core.arrays.categorical import (
    Categorical, _factorize_from_iterables)
from pandas.core.arrays.sparse import BlockIndex, IntIndex
from pandas.core.base import StringMixin
import pandas.core.common as com
from pandas.core.computation.pytables import Expr, maybe_expression
from pandas.core.config import get_option
from pandas.core.index import ensure_index
from pandas.core.internals import (
    BlockManager, _block2d_to_blocknd, _block_shape, _factor_indexer,
    make_block)

from pandas.io.common import _stringify_path
from pandas.io.formats.printing import adjoin, pprint_thing

# versioning attribute
_version = '0.15.2'

# encoding
# PY3 encoding if we don't specify
_default_encoding = 'UTF-8'


def _ensure_decoded(s):
    """ if we have bytes, decode them to unicode """
    if isinstance(s, np.bytes_):
        s = s.decode('UTF-8')
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        if PY3:
            encoding = _default_encoding
    return encoding


def _ensure_str(name):
    """Ensure that an index / column name is a str (python 3) or
    unicode (python 2); otherwise they may be np.string dtype.
    Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, compat.string_types):
        name = compat.text_type(name)
    return name
Term = Expr


def _ensure_term(where, scope_level):
    """
    Ensure that the where clause is a Term or a list of Terms.

    This makes sure that we capture the scope of variables that are
    passed; the Terms are created here with frame_level=2 (we are 2
    levels down from the caller).
    """
    # only consider list/tuple here as an ndarray is automatically a
    # coordinate list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        wlist = []
        for w in filter(lambda x: x is not None, where):
            if not maybe_expression(w):
                wlist.append(w)
            else:
                wlist.append(Term(w, scope_level=level))
        where = wlist
    elif maybe_expression(where):
        where = Term(where, scope_level=level)
    return where
class PossibleDataLossError(Exception):
    pass


class ClosedFileError(Exception):
    pass


class IncompatibilityWarning(Warning):
    pass


incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""


class AttributeConflictWarning(Warning):
    pass


attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""


class DuplicateWarning(Warning):
    pass


duplicate_doc = """
duplicate entries in table, taking most recently appended
"""

performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {
    u'f': 'fixed',
    u'fixed': 'fixed',
    u't': 'table',
    u'table': 'table',
}

format_deprecate_doc = """
the table keyword has been deprecated
use the format='fixed(f)|table(t)' keyword instead
    fixed(f) : specifies the Fixed format
               and is the default for put operations
    table(t) : specifies the Table format
               and is the default for append operations
"""

# map object types
_TYPE_MAP = {
    Series: u'series',
    SparseSeries: u'sparse_series',
    DataFrame: u'frame',
    SparseDataFrame: u'sparse_frame',
    Panel: u'wide',
}

# storer class map
_STORER_MAP = {
    u'Series': 'LegacySeriesFixed',
    u'DataFrame': 'LegacyFrameFixed',
    u'DataMatrix': 'LegacyFrameFixed',
    u'series': 'SeriesFixed',
    u'sparse_series': 'SparseSeriesFixed',
    u'frame': 'FrameFixed',
    u'sparse_frame': 'SparseFrameFixed',
    u'wide': 'PanelFixed',
}

# table class map
_TABLE_MAP = {
    u'generic_table': 'GenericTable',
    u'appendable_series': 'AppendableSeriesTable',
    u'appendable_multiseries': 'AppendableMultiSeriesTable',
    u'appendable_frame': 'AppendableFrameTable',
    u'appendable_multiframe': 'AppendableMultiFrameTable',
    u'appendable_panel': 'AppendablePanelTable',
    u'worm': 'WORMTable',
    u'legacy_frame': 'LegacyFrameTable',
    u'legacy_panel': 'LegacyPanelTable',
}

# axes map
_AXES_MAP = {
    DataFrame: [0],
    Panel: [1, 2]
}

# register our configuration options
dropna_doc = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc = """
: format
    default format writing format, if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix('io.hdf'):
    config.register_option('dropna_table', False, dropna_doc,
                           validator=config.is_bool)
    config.register_option(
        'default_format', None, format_doc,
        validator=config.is_one_of_factory(['fixed', 'table', None])
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables
        _table_mod = tables

        # version requirements
        if LooseVersion(tables.__version__) < LooseVersion('3.0.0'):
            raise ImportError("PyTables version >= 3.0.0 is required")

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        try:
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == 'strict')
        except AttributeError:
            pass

    return _table_mod
# interface to/from ###

def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
           append=None, **kwargs):
    """ store this object, close it if we opened it """
    if append:
        f = lambda store: store.append(key, value, **kwargs)
    else:
        f = lambda store: store.put(key, value, **kwargs)

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, string_types):
        with HDFStore(path_or_buf, mode=mode, complevel=complevel,
                      complib=complib) as store:
            f(store)
    else:
        f(path_or_buf)
def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria.

    Parameters
    ----------
    path_or_buf : string, buffer or path object
        Path to the file to open, or an open :class:`pandas.HDFStore` object.
        Supports any object implementing the ``__fspath__`` protocol.
        This includes :class:`pathlib.Path` and py._path.local.LocalPath
        objects.

        .. versionadded:: 0.19.0 support for pathlib, py.path.
        .. versionadded:: 0.21.0 support for __fspath__ protocol.
    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, optional
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of column names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    pandas.DataFrame.to_hdf : Write a HDF file from a DataFrame.
    pandas.HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
    >>> df.to_hdf('./store.h5', 'data')
    >>> reread = pd.read_hdf('./store.h5')
    """
    if mode not in ['r', 'r+', 'a']:
        raise ValueError('mode {0} is not allowed while performing a read. '
                         'Allowed modes are r, r+ and a.'.format(mode))

    # grab the scope
    if 'where' in kwargs:
        kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise IOError('The HDFStore must be open for reading.')

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = _stringify_path(path_or_buf)
        if not isinstance(path_or_buf, string_types):
            raise NotImplementedError('Support for generic buffers has not '
                                      'been implemented.')
        try:
            exists = os.path.exists(path_or_buf)

        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise compat.FileNotFoundError(
                'File {path} does not exist'.format(path=path_or_buf))

        store = HDFStore(path_or_buf, mode=mode, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError('No dataset in HDF5 file.')
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError('key must be provided when HDF5 file '
                                     'contains multiple datasets.')
            key = candidate_only_group._v_pathname
        return store.select(key, auto_close=auto_close, **kwargs)
    except (ValueError, TypeError):
        # if there is an error, close the store
        try:
            store.close()
        except AttributeError:
            pass

        raise


def _is_metadata_of(group, parent_group):
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == 'meta':
            return True
        current = current._v_parent
    return False
class HDFStore(StringMixin):
    """
    Dict-like IO interface for storing pandas objects in PyTables,
    in either Fixed or Table format.

    Parameters
    ----------
    path : string
        File path to HDF5 file
    mode : {'a', 'w', 'r', 'r+'}, default 'a'
        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available raises
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()
    """

    def __init__(self, path, mode=None, complevel=None, complib=None,
                 fletcher32=False, **kwargs):

        if 'format' in kwargs:
            raise ValueError('format is not a defined argument for HDFStore')

        try:
            import tables  # noqa
        except ImportError as ex:  # pragma: no cover
            raise ImportError('HDFStore requires PyTables, "{ex!s}" problem '
                              'importing'.format(ex=ex))

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                "complib only supports {libs} compression.".format(
                    libs=tables.filters.all_complibs))

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = _stringify_path(path)
        if mode is None:
            mode = 'a'
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self):
        return self._path

    @property
    def root(self):
        """ return the root node """
        self._check_if_open()
        return self._handle.root

    @property
    def filename(self):
        return self._path

    def __getitem__(self, key):
        return self.get(key)

    def __setitem__(self, key, value):
        self.put(key, value)

    def __delitem__(self, key):
        return self.remove(key)

    def __getattr__(self, name):
        """ allow attribute access to get stores """
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            "'{object}' object has no attribute '{name}'".format(
                object=type(self).__name__, name=name))

    def __contains__(self, key):
        """ check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False

    def __len__(self):
        return len(self.groups())

    def __unicode__(self):
        return '{type}\nFile path: {path}\n'.format(
            type=type(self), path=pprint_thing(self._path))

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def keys(self):
        """
        Return a (potentially unordered) list of the keys corresponding to the
        objects stored in the HDFStore. These are ABSOLUTE path-names (e.g.
        have the leading '/').
        """
        return [n._v_pathname for n in self.groups()]

    def __iter__(self):
        return iter(self.keys())

    def items(self):
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    iteritems = items
    def open(self, mode='a', **kwargs):
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        """
        tables = _tables()

        if self._mode != mode:

            # if we are changing a write mode to read, ok
            if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
                pass
            elif mode in ['w']:

                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        "Re-opening the file [{0}] with mode [{1}] "
                        "will delete the current file!"
                        .format(self._path, self._mode)
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(self._complevel, self._complib,
                                              fletcher32=self._fletcher32)

        try:
            self._handle = tables.open_file(self._path, self._mode, **kwargs)
        except (IOError) as e:  # pragma: no cover
            if 'can not be written' in str(e):
                print(
                    'Opening {path} in read-only mode'.format(path=self._path))
                self._handle = tables.open_file(self._path, 'r', **kwargs)
            else:
                raise

        except (ValueError) as e:

            # trap PyTables >= 3.1 FILE_OPEN_POLICY exception
            # to provide an updated message
            if 'FILE_OPEN_POLICY' in str(e):
                e = ValueError(
                    "PyTables [{version}] no longer supports opening multiple "
                    "files\n"
                    "even in read-only mode on this HDF5 version "
                    "[{hdf_version}]. You can accept this\n"
                    "and not open the same file multiple times at once,\n"
                    "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
                    "which allows\n"
                    "files to be opened multiple times at once\n"
                    .format(version=tables.__version__,
                            hdf_version=tables.get_hdf5_version()))
            raise e

        except (Exception) as e:

            # trying to read from a non-existent file causes an error which
            # is not part of IOError, make it one
            if self._mode == 'r' and 'Unable to open/create file' in str(e):
                raise IOError(str(e))
            raise

    def close(self):
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self):
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync=False):
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                try:
                    os.fsync(self._handle.fileno())
                except OSError:
                    pass
    def get(self, key):
        """
        Retrieve pandas object stored in file

        Parameters
        ----------
        key : object

        Returns
        -------
        obj : same type as object stored in file
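
        Examples
        --------
        A minimal sketch; assumes a file 'store.h5' containing a key 'df'
        (both names are illustrative):

        >>> store = pd.HDFStore('store.h5')
        >>> df = store.get('df')  # equivalent to store['df']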
  563. """
  564. group = self.get_node(key)
  565. if group is None:
  566. raise KeyError('No object named {key} in the file'.format(key=key))
  567. return self._read_group(group)
  568. def select(self, key, where=None, start=None, stop=None, columns=None,
  569. iterator=False, chunksize=None, auto_close=False, **kwargs):
  570. """
  571. Retrieve pandas object stored in file, optionally based on where
  572. criteria
  573. Parameters
  574. ----------
  575. key : object
  576. where : list of Term (or convertible) objects, optional
  577. start : integer (defaults to None), row number to start selection
  578. stop : integer (defaults to None), row number to stop selection
  579. columns : a list of columns that if not None, will limit the return
  580. columns
  581. iterator : boolean, return an iterator, default False
  582. chunksize : nrows to include in iteration, return an iterator
  583. auto_close : boolean, should automatically close the store when
  584. finished, default is False
  585. Returns
  586. -------
  587. The selected object
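
        Examples
        --------
        A minimal sketch, assuming a table was appended earlier with a data
        column 'A' (key and column names are illustrative):

        >>> store.append('df', pd.DataFrame({'A': range(5)}),
        ...              data_columns=['A'])
        >>> subset = store.select('df', where='A > 2')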
  588. """
  589. group = self.get_node(key)
  590. if group is None:
  591. raise KeyError('No object named {key} in the file'.format(key=key))
  592. # create the storer and axes
  593. where = _ensure_term(where, scope_level=1)
  594. s = self._create_storer(group)
  595. s.infer_axes()
  596. # function to call on iteration
  597. def func(_start, _stop, _where):
  598. return s.read(start=_start, stop=_stop,
  599. where=_where,
  600. columns=columns)
  601. # create the iterator
  602. it = TableIterator(self, s, func, where=where, nrows=s.nrows,
  603. start=start, stop=stop, iterator=iterator,
  604. chunksize=chunksize, auto_close=auto_close)
  605. return it.get_result()
  606. def select_as_coordinates(
  607. self, key, where=None, start=None, stop=None, **kwargs):
  608. """
  609. return the selection as an Index
  610. Parameters
  611. ----------
  612. key : object
  613. where : list of Term (or convertible) objects, optional
  614. start : integer (defaults to None), row number to start selection
  615. stop : integer (defaults to None), row number to stop selection
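
        Examples
        --------
        A sketch; the returned coordinates can be passed back to ``select``
        (key and condition are illustrative):

        >>> coords = store.select_as_coordinates('df', 'A > 2')
        >>> subset = store.select('df', where=coords)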
  616. """
  617. where = _ensure_term(where, scope_level=1)
  618. return self.get_storer(key).read_coordinates(where=where, start=start,
  619. stop=stop, **kwargs)
  620. def select_column(self, key, column, **kwargs):
  621. """
  622. return a single column from the table. This is generally only useful to
  623. select an indexable
  624. Parameters
  625. ----------
  626. key : object
  627. column: the column of interest
  628. Exceptions
  629. ----------
  630. raises KeyError if the column is not found (or key is not a valid
  631. store)
  632. raises ValueError if the column can not be extracted individually (it
  633. is part of a data block)
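
        Examples
        --------
        A sketch; 'index' is always an indexable of a table, and 'df' is an
        illustrative key:

        >>> idx = store.select_column('df', 'index')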
  634. """
  635. return self.get_storer(key).read_column(column=column, **kwargs)
  636. def select_as_multiple(self, keys, where=None, selector=None, columns=None,
  637. start=None, stop=None, iterator=False,
  638. chunksize=None, auto_close=False, **kwargs):
  639. """ Retrieve pandas objects from multiple tables
  640. Parameters
  641. ----------
  642. keys : a list of the tables
  643. selector : the table to apply the where criteria (defaults to keys[0]
  644. if not supplied)
  645. columns : the columns I want back
  646. start : integer (defaults to None), row number to start selection
  647. stop : integer (defaults to None), row number to stop selection
  648. iterator : boolean, return an iterator, default False
  649. chunksize : nrows to include in iteration, return an iterator
  650. Exceptions
  651. ----------
  652. raises KeyError if keys or selector is not found or keys is empty
  653. raises TypeError if keys is not a list or tuple
  654. raises ValueError if the tables are not ALL THE SAME DIMENSIONS
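
        Examples
        --------
        A sketch, assuming 'df1' and 'df2' are tables with the same number
        of rows (e.g. written via ``append_to_multiple``):

        >>> combined = store.select_as_multiple(['df1', 'df2'],
        ...                                     where='index > 0',
        ...                                     selector='df1')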
  655. """
  656. # default to single select
  657. where = _ensure_term(where, scope_level=1)
  658. if isinstance(keys, (list, tuple)) and len(keys) == 1:
  659. keys = keys[0]
  660. if isinstance(keys, string_types):
  661. return self.select(key=keys, where=where, columns=columns,
  662. start=start, stop=stop, iterator=iterator,
  663. chunksize=chunksize, **kwargs)
  664. if not isinstance(keys, (list, tuple)):
  665. raise TypeError("keys must be a list/tuple")
  666. if not len(keys):
  667. raise ValueError("keys must have a non-zero length")
  668. if selector is None:
  669. selector = keys[0]
  670. # collect the tables
  671. tbls = [self.get_storer(k) for k in keys]
  672. s = self.get_storer(selector)
  673. # validate rows
  674. nrows = None
  675. for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
  676. if t is None:
  677. raise KeyError("Invalid table [{key}]".format(key=k))
  678. if not t.is_table:
  679. raise TypeError(
  680. "object [{obj}] is not a table, and cannot be used in all "
  681. "select as multiple".format(obj=t.pathname)
  682. )
  683. if nrows is None:
  684. nrows = t.nrows
  685. elif t.nrows != nrows:
  686. raise ValueError(
  687. "all tables must have exactly the same nrows!")
  688. # axis is the concentation axes
  689. axis = list({t.non_index_axes[0][0] for t in tbls})[0]
  690. def func(_start, _stop, _where):
  691. # retrieve the objs, _where is always passed as a set of
  692. # coordinates here
  693. objs = [t.read(where=_where, columns=columns, start=_start,
  694. stop=_stop, **kwargs) for t in tbls]
  695. # concat and return
  696. return concat(objs, axis=axis,
  697. verify_integrity=False)._consolidate()
  698. # create the iterator
  699. it = TableIterator(self, s, func, where=where, nrows=nrows,
  700. start=start, stop=stop, iterator=iterator,
  701. chunksize=chunksize, auto_close=auto_close)
  702. return it.get_result(coordinates=True)
    def put(self, key, value, format=None, append=False, **kwargs):
        """
        Store object in HDFStore

        Parameters
        ----------
        key : object
        value : {Series, DataFrame, Panel}
        format : 'fixed(f)|table(t)', default is 'fixed'
            fixed(f) : Fixed format
                Fast writing/reading. Not appendable, nor searchable.
            table(t) : Table format
                Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching
                / selecting subsets of the data.
        append : boolean, default False
            This will force Table format, and append the input data to the
            existing.
        data_columns : list of columns to create as data columns, or True to
            use all columns. See
            `here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__  # noqa
        encoding : default None, provide an encoding for strings
        dropna : boolean, default False, do not write an ALL nan row to
            the store; settable by the option 'io.hdf.dropna_table'
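
        Examples
        --------
        A minimal sketch; key names are illustrative:

        >>> store.put('df', pd.DataFrame({'A': [1, 2]}))  # fixed format
        >>> store.put('df_t', pd.DataFrame({'A': [1, 2]}), format='table',
        ...           data_columns=True)  # queryable table format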
  726. """
  727. if format is None:
  728. format = get_option("io.hdf.default_format") or 'fixed'
  729. kwargs = self._validate_format(format, kwargs)
  730. self._write_to_group(key, value, append=append, **kwargs)
  731. def remove(self, key, where=None, start=None, stop=None):
  732. """
  733. Remove pandas object partially by specifying the where condition
  734. Parameters
  735. ----------
  736. key : string
  737. Node to remove or delete rows from
  738. where : list of Term (or convertible) objects, optional
  739. start : integer (defaults to None), row number to start selection
  740. stop : integer (defaults to None), row number to stop selection
  741. Returns
  742. -------
  743. number of rows removed (or None if not a Table)
  744. Exceptions
  745. ----------
  746. raises KeyError if key is not a valid store
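
        Examples
        --------
        A sketch; a bare key drops the whole node, while a where clause
        deletes rows (table format only; key names are illustrative):

        >>> store.remove('df_fixed')
        >>> store.remove('df_table', where='index > 2')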
  747. """
  748. where = _ensure_term(where, scope_level=1)
  749. try:
  750. s = self.get_storer(key)
  751. except KeyError:
  752. # the key is not a valid store, re-raising KeyError
  753. raise
  754. except Exception:
  755. if where is not None:
  756. raise ValueError(
  757. "trying to remove a node with a non-None where clause!")
  758. # we are actually trying to remove a node (with children)
  759. s = self.get_node(key)
  760. if s is not None:
  761. s._f_remove(recursive=True)
  762. return None
  763. # remove the node
  764. if com._all_none(where, start, stop):
  765. s.group._f_remove(recursive=True)
  766. # delete from the table
  767. else:
  768. if not s.is_table:
  769. raise ValueError(
  770. 'can only remove with where on objects written as tables')
  771. return s.delete(where=where, start=start, stop=stop)
  772. def append(self, key, value, format=None, append=True, columns=None,
  773. dropna=None, **kwargs):
  774. """
  775. Append to Table in file. Node must already exist and be Table
  776. format.
  777. Parameters
  778. ----------
  779. key : object
  780. value : {Series, DataFrame, Panel}
  781. format : 'table' is the default
  782. table(t) : table format
  783. Write as a PyTables Table structure which may perform
  784. worse but allow more flexible operations like searching
  785. / selecting subsets of the data
  786. append : boolean, default True, append the input data to the
  787. existing
  788. data_columns : list of columns, or True, default None
  789. List of columns to create as indexed data columns for on-disk
  790. queries, or True to use all columns. By default only the axes
  791. of the object are indexed. See `here
  792. <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
  793. min_itemsize : dict of columns that specify minimum string sizes
  794. nan_rep : string to use as string nan represenation
  795. chunksize : size to chunk the writing
  796. expectedrows : expected TOTAL row size of this table
  797. encoding : default None, provide an encoding for strings
  798. dropna : boolean, default False, do not write an ALL nan row to
  799. the store settable by the option 'io.hdf.dropna_table'
  800. Notes
  801. -----
  802. Does *not* check if data being appended overlaps with existing
  803. data in the table, so be careful
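
        Examples
        --------
        A minimal sketch; key and column names are illustrative:

        >>> store.append('df', pd.DataFrame({'A': [1, 2]}),
        ...              data_columns=['A'])
        >>> store.append('df', pd.DataFrame({'A': [3, 4]}))  # adds rows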
  804. """
  805. if columns is not None:
  806. raise TypeError("columns is not a supported keyword in append, "
  807. "try data_columns")
  808. if dropna is None:
  809. dropna = get_option("io.hdf.dropna_table")
  810. if format is None:
  811. format = get_option("io.hdf.default_format") or 'table'
  812. kwargs = self._validate_format(format, kwargs)
  813. self._write_to_group(key, value, append=append, dropna=dropna,
  814. **kwargs)
  815. def append_to_multiple(self, d, value, selector, data_columns=None,
  816. axes=None, dropna=False, **kwargs):
  817. """
  818. Append to multiple tables
  819. Parameters
  820. ----------
  821. d : a dict of table_name to table_columns, None is acceptable as the
  822. values of one node (this will get all the remaining columns)
  823. value : a pandas object
  824. selector : a string that designates the indexable table; all of its
  825. columns will be designed as data_columns, unless data_columns is
  826. passed, in which case these are used
  827. data_columns : list of columns to create as data columns, or True to
  828. use all columns
  829. dropna : if evaluates to True, drop rows from all tables if any single
  830. row in each table has all NaN. Default False.
  831. Notes
  832. -----
  833. axes parameter is currently not accepted
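
        Examples
        --------
        A sketch: store columns 'A'/'B' in one table and the remaining
        columns in another, with 'df1_t' as the selector (all names are
        illustrative):

        >>> df = pd.DataFrame({'A': [1.], 'B': [2.], 'C': [3.]})
        >>> store.append_to_multiple(
        ...     {'df1_t': ['A', 'B'], 'df2_t': None}, df, selector='df1_t')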
  834. """
  835. if axes is not None:
  836. raise TypeError("axes is currently not accepted as a parameter to"
  837. " append_to_multiple; you can create the "
  838. "tables independently instead")
  839. if not isinstance(d, dict):
  840. raise ValueError(
  841. "append_to_multiple must have a dictionary specified as the "
  842. "way to split the value"
  843. )
  844. if selector not in d:
  845. raise ValueError(
  846. "append_to_multiple requires a selector that is in passed dict"
  847. )
  848. # figure out the splitting axis (the non_index_axis)
  849. axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
  850. # figure out how to split the value
  851. remain_key = None
  852. remain_values = []
  853. for k, v in d.items():
  854. if v is None:
  855. if remain_key is not None:
  856. raise ValueError(
  857. "append_to_multiple can only have one value in d that "
  858. "is None"
  859. )
  860. remain_key = k
  861. else:
  862. remain_values.extend(v)
  863. if remain_key is not None:
  864. ordered = value.axes[axis]
  865. ordd = ordered.difference(Index(remain_values))
  866. ordd = sorted(ordered.get_indexer(ordd))
  867. d[remain_key] = ordered.take(ordd)
  868. # data_columns
  869. if data_columns is None:
  870. data_columns = d[selector]
  871. # ensure rows are synchronized across the tables
  872. if dropna:
  873. idxs = (value[cols].dropna(how='all').index for cols in d.values())
  874. valid_index = next(idxs)
  875. for index in idxs:
  876. valid_index = valid_index.intersection(index)
  877. value = value.loc[valid_index]
  878. # append
  879. for k, v in d.items():
  880. dc = data_columns if k == selector else None
  881. # compute the val
  882. val = value.reindex(v, axis=axis)
  883. self.append(k, val, data_columns=dc, **kwargs)
  884. def create_table_index(self, key, **kwargs):
  885. """ Create a pytables index on the table
  886. Parameters
  887. ----------
  888. key : object (the node to index)
  889. Exceptions
  890. ----------
  891. raises if the node is not a table
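
        Examples
        --------
        A sketch; 'df' must have been written in table format:

        >>> store.append('df', pd.DataFrame({'A': [1, 2]}))
        >>> store.create_table_index('df', optlevel=9, kind='full')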
  892. """
  893. # version requirements
  894. _tables()
  895. s = self.get_storer(key)
  896. if s is None:
  897. return
  898. if not s.is_table:
  899. raise TypeError(
  900. "cannot create table index on a Fixed format store")
  901. s.create_index(**kwargs)
  902. def groups(self):
  903. """return a list of all the top-level nodes (that are not themselves a
  904. pandas storage object)
  905. """
  906. _tables()
  907. self._check_if_open()
  908. return [
  909. g for g in self._handle.walk_groups()
  910. if (not isinstance(g, _table_mod.link.Link) and
  911. (getattr(g._v_attrs, 'pandas_type', None) or
  912. getattr(g, 'table', None) or
  913. (isinstance(g, _table_mod.table.Table) and
  914. g._v_name != u'table')))
  915. ]
  916. def walk(self, where="/"):
  917. """ Walk the pytables group hierarchy for pandas objects
  918. This generator will yield the group path, subgroups and pandas object
  919. names for each group.
  920. Any non-pandas PyTables objects that are not a group will be ignored.
  921. The `where` group itself is listed first (preorder), then each of its
  922. child groups (following an alphanumerical order) is also traversed,
  923. following the same procedure.
  924. .. versionadded:: 0.24.0
  925. Parameters
  926. ----------
  927. where : str, optional
  928. Group where to start walking.
  929. If not supplied, the root group is used.
  930. Yields
  931. ------
  932. path : str
  933. Full path to a group (without trailing '/')
  934. groups : list of str
  935. names of the groups contained in `path`
  936. leaves : list of str
  937. names of the pandas objects contained in `path`
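
        Examples
        --------
        A sketch of walking the whole hierarchy:

        >>> for path, groups, leaves in store.walk():
        ...     for leaf in leaves:
        ...         print('/'.join([path, leaf]))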
  938. """
  939. _tables()
  940. self._check_if_open()
  941. for g in self._handle.walk_groups(where):
  942. if getattr(g._v_attrs, 'pandas_type', None) is not None:
  943. continue
  944. groups = []
  945. leaves = []
  946. for child in g._v_children.values():
  947. pandas_type = getattr(child._v_attrs, 'pandas_type', None)
  948. if pandas_type is None:
  949. if isinstance(child, _table_mod.group.Group):
  950. groups.append(child._v_name)
  951. else:
  952. leaves.append(child._v_name)
  953. yield (g._v_pathname.rstrip('/'), groups, leaves)
  954. def get_node(self, key):
  955. """ return the node with the key or None if it does not exist """
  956. self._check_if_open()
  957. try:
  958. if not key.startswith('/'):
  959. key = '/' + key
  960. return self._handle.get_node(self.root, key)
  961. except _table_mod.exceptions.NoSuchNodeError:
  962. return None
  963. def get_storer(self, key):
  964. """ return the storer object for a key, raise if not in the file """
  965. group = self.get_node(key)
  966. if group is None:
  967. raise KeyError('No object named {key} in the file'.format(key=key))
  968. s = self._create_storer(group)
  969. s.infer_axes()
  970. return s
  971. def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
  972. complevel=None, fletcher32=False, overwrite=True):
  973. """ copy the existing store to a new file, upgrading in place
  974. Parameters
  975. ----------
  976. propindexes: restore indexes in copied file (defaults to True)
  977. keys : list of keys to include in the copy (defaults to all)
  978. overwrite : overwrite (remove and replace) existing nodes in the
  979. new store (default is True)
  980. mode, complib, complevel, fletcher32 same as in HDFStore.__init__
  981. Returns
  982. -------
  983. open file handle of the new store
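
        Examples
        --------
        A sketch; copies every key to a new file, keeping table indexes
        (the file name is illustrative):

        >>> new_store = store.copy('new.h5')
        >>> new_store.close()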
  984. """
  985. new_store = HDFStore(
  986. file,
  987. mode=mode,
  988. complib=complib,
  989. complevel=complevel,
  990. fletcher32=fletcher32)
  991. if keys is None:
  992. keys = list(self.keys())
  993. if not isinstance(keys, (tuple, list)):
  994. keys = [keys]
  995. for k in keys:
  996. s = self.get_storer(k)
  997. if s is not None:
  998. if k in new_store:
  999. if overwrite:
  1000. new_store.remove(k)
  1001. data = self.select(k)
  1002. if s.is_table:
  1003. index = False
  1004. if propindexes:
  1005. index = [a.name for a in s.axes if a.is_indexed]
  1006. new_store.append(
  1007. k, data, index=index,
  1008. data_columns=getattr(s, 'data_columns', None),
  1009. encoding=s.encoding
  1010. )
  1011. else:
  1012. new_store.put(k, data, encoding=s.encoding)
  1013. return new_store

    def info(self):
        """
        Return detailed information on the store, as a printable string.

        .. versionadded:: 0.21.0
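
        Examples
        --------
        Illustrative only; the file name and contents are hypothetical::

            >>> print(store.info())
            <class 'pandas.io.pytables.HDFStore'>
            File path: data.h5
            /df    frame    (shape->[2,2])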
  1018. """
        output = '{type}\nFile path: {path}\n'.format(
            type=type(self), path=pprint_thing(self._path))
        if self.is_open:
            lkeys = sorted(list(self.keys()))
            if len(lkeys):
                keys = []
                values = []

                for k in lkeys:
                    try:
                        s = self.get_storer(k)
                        if s is not None:
                            keys.append(pprint_thing(s.pathname or k))
                            values.append(
                                pprint_thing(s or 'invalid_HDFStore node'))
                    except Exception as detail:
                        keys.append(k)
                        values.append(
                            "[invalid_HDFStore node: {detail}]".format(
                                detail=pprint_thing(detail)))

                output += adjoin(12, keys, values)
            else:
                output += 'Empty'
        else:
            output += "File is CLOSED"

        return output

    # private methods ######

    def _check_if_open(self):
        if not self.is_open:
            raise ClosedFileError("{0} file is not open!".format(self._path))

    def _validate_format(self, format, kwargs):
        """ validate / deprecate formats; return the new kwargs """
        kwargs = kwargs.copy()

        # validate
        try:
            kwargs['format'] = _FORMAT_MAP[format.lower()]
        except KeyError:
            raise TypeError("invalid HDFStore format specified [{0}]"
                            .format(format))

        return kwargs
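
    # Illustrative only: both 'f' and 'fixed' normalize to 'fixed', and both
    # 't' and 'table' to 'table', per the _FORMAT_MAP defined earlier in
    # this module.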

    def _create_storer(self, group, format=None, value=None, append=False,
                       **kwargs):
        """ return a suitable class to operate """

        def error(t):
            raise TypeError(
                "cannot properly create the storer for: [{t}] [group->"
                "{group},value->{value},format->{format},append->{append},"
                "kwargs->{kwargs}]".format(t=t, group=group,
                                           value=type(value), format=format,
                                           append=append, kwargs=kwargs))

        pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
        tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))

        # infer the pt from the passed value
        if pt is None:
            if value is None:
                _tables()
                if (getattr(group, 'table', None) or
                        isinstance(group, _table_mod.table.Table)):
                    pt = u'frame_table'
                    tt = u'generic_table'
                else:
                    raise TypeError(
                        "cannot create a storer if the object does not "
                        "exist and no value is passed")
            else:
                try:
                    pt = _TYPE_MAP[type(value)]
                except KeyError:
                    error('_TYPE_MAP')

                # we are actually a table
                if format == 'table':
                    pt += u'_table'

        # a storer node
        if u'table' not in pt:
            try:
                return globals()[_STORER_MAP[pt]](self, group, **kwargs)
            except KeyError:
                error('_STORER_MAP')

        # existing node (and must be a table)
        if tt is None:

            # if we are a writer, determine the tt
            if value is not None:

                if pt == u'series_table':
                    index = getattr(value, 'index', None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = u'appendable_series'
                        elif index.nlevels > 1:
                            tt = u'appendable_multiseries'
                elif pt == u'frame_table':
                    index = getattr(value, 'index', None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = u'appendable_frame'
                        elif index.nlevels > 1:
                            tt = u'appendable_multiframe'
                elif pt == u'wide_table':
                    tt = u'appendable_panel'
                elif pt == u'ndim_table':
                    tt = u'appendable_ndim'

            else:

                # distinguish between a frame/table
                tt = u'legacy_panel'
                try:
                    fields = group.table._v_attrs.fields
                    if len(fields) == 1 and fields[0] == u'value':
                        tt = u'legacy_frame'
                except IndexError:
                    pass

        try:
            return globals()[_TABLE_MAP[tt]](self, group, **kwargs)
        except KeyError:
            error('_TABLE_MAP')

    def _write_to_group(self, key, value, format, index=True, append=False,
                        complib=None, encoding=None, **kwargs):
        group = self.get_node(key)

        # remove the node if we are not appending
        if group is not None and not append:
            self._handle.remove_node(group, recursive=True)
            group = None

        # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
        if getattr(value, 'empty', None) and (format == 'table' or append):
            return

        if group is None:
            paths = key.split('/')

            # recursively create the groups
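            # e.g. a key of '/a/b/c' creates (or reuses) the intermediate
            # groups '/a' and '/a/b' before creating '/a/b/c' itself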
            path = '/'
            for p in paths:
                if not len(p):
                    continue
                new_path = path
                if not path.endswith('/'):
                    new_path += '/'
                new_path += p
                group = self.get_node(new_path)
                if group is None:
                    group = self._handle.create_group(path, p)
                path = new_path

        s = self._create_storer(group, format, value, append=append,
                                encoding=encoding, **kwargs)
        if append:
            # raise if we are trying to append to a Fixed format,
            # or a table that exists (and we are putting)
            if (not s.is_table or
                    (s.is_table and format == 'fixed' and s.is_exists)):
                raise ValueError('Can only append to Tables')
            if not s.is_exists:
                s.set_object_info()
        else:
            s.set_object_info()

        if not s.is_table and complib:
            raise ValueError(
                'Compression not supported on Fixed format stores'
            )

        # write the object
        s.write(obj=value, append=append, complib=complib, **kwargs)

        if s.is_table and index:
            s.create_index(columns=index)

    def _read_group(self, group, **kwargs):
        s = self._create_storer(group)
        s.infer_axes()
        return s.read(**kwargs)


class TableIterator(object):

    """ define the iteration interface on a table

    Parameters
    ----------
    store : the reference store
    s : the referred storer
    func : the function to execute the query
    where : the where of the query
    nrows : the rows to iterate on
    start : the passed start value (default is None)
    stop : the passed stop value (default is None)
    iterator : boolean, whether to use the default iterator
    chunksize : the passed chunking value (default is 100000)
    auto_close : boolean, automatically close the store at the end of
        iteration, default is False
    kwargs : the passed kwargs
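
    Examples
    --------
    Typically constructed by ``HDFStore.select`` rather than directly; an
    illustrative sketch (``store``, ``'df'`` and ``process`` are
    hypothetical)::

        >>> for chunk in store.select('df', chunksize=10000):
        ...     process(chunk)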
  1197. """
  1198. def __init__(self, store, s, func, where, nrows, start=None, stop=None,
  1199. iterator=False, chunksize=None, auto_close=False):
  1200. self.store = store
  1201. self.s = s
  1202. self.func = func
  1203. self.where = where
  1204. # set start/stop if they are not set if we are a table
  1205. if self.s.is_table:
  1206. if nrows is None:
  1207. nrows = 0
  1208. if start is None:
  1209. start = 0
  1210. if stop is None:
  1211. stop = nrows
  1212. stop = min(nrows, stop)
  1213. self.nrows = nrows
  1214. self.start = start
  1215. self.stop = stop
  1216. self.coordinates = None
  1217. if iterator or chunksize is not None:
  1218. if chunksize is None:
  1219. chunksize = 100000
  1220. self.chunksize = int(chunksize)
  1221. else:
  1222. self.chunksize = None
  1223. self.auto_close = auto_close

    def __iter__(self):

        # iterate
        current = self.start
        while current < self.stop:

            stop = min(current + self.chunksize, self.stop)
            value = self.func(None, None, self.coordinates[current:stop])
            current = stop
            if value is None or not len(value):
                continue

            yield value

        self.close()

    def close(self):
        if self.auto_close:
            self.store.close()

    def get_result(self, coordinates=False):

        # return the actual iterator
        if self.chunksize is not None:
            if not self.s.is_table:
                raise TypeError(
                    "can only use an iterator or chunksize on a table")

            self.coordinates = self.s.read_coordinates(where=self.where)

            return self

        # if specified, read via coordinates (necessary for multiple
        # selections)
        if coordinates:
            where = self.s.read_coordinates(where=self.where,
                                            start=self.start,
                                            stop=self.stop)
        else:
            where = self.where

        # directly return the result
        results = self.func(self.start, self.stop, where)
        self.close()
        return results


class IndexCol(StringMixin):

    """ an index column description class

    Parameters
    ----------
    axis   : axis which I reference
    values : the ndarray like converted values
    kind   : a string description of this type
    typ    : the pytables type
    pos    : the position in the pytables
    """

    is_an_indexable = True
    is_data_indexable = True
    _info_fields = ['freq', 'tz', 'index_name']

    def __init__(self, values=None, kind=None, typ=None, cname=None,
                 itemsize=None, name=None, axis=None, kind_attr=None,
                 pos=None, freq=None, tz=None, index_name=None, **kwargs):
        self.values = values
        self.kind = kind
        self.typ = typ
        self.itemsize = itemsize
        self.name = name
        self.cname = cname
        self.kind_attr = kind_attr
        self.axis = axis
        self.pos = pos
        self.freq = freq
        self.tz = tz
        self.index_name = index_name
        self.table = None
        self.meta = None
        self.metadata = None

        if name is not None:
            self.set_name(name, kind_attr)
        if pos is not None:
            self.set_pos(pos)

    def set_name(self, name, kind_attr=None):
        """ set the name of this indexer """
        self.name = name
        self.kind_attr = kind_attr or "{name}_kind".format(name=name)
        if self.cname is None:
            self.cname = name

        return self

    def set_axis(self, axis):
        """ set the axis over which I index """
        self.axis = axis

        return self

    def set_pos(self, pos):
        """ set the position of this column in the Table """
        self.pos = pos
        if pos is not None and self.typ is not None:
            self.typ._v_pos = pos
        return self

    def set_table(self, table):
        self.table = table
        return self

    def __unicode__(self):
        temp = tuple(
            map(pprint_thing,
                (self.name,
                 self.cname,
                 self.axis,
                 self.pos,
                 self.kind)))
        return ','.join(("{key}->{value}".format(key=key, value=value)
                         for key, value in zip(
                             ['name', 'cname', 'axis', 'pos', 'kind'], temp)))

    def __eq__(self, other):
        """ compare 2 col items """
        return all(getattr(self, a, None) == getattr(other, a, None)
                   for a in ['name', 'cname', 'axis', 'pos'])

    def __ne__(self, other):
        return not self.__eq__(other)

    @property
    def is_indexed(self):
        """ return whether I am an indexed column """
        try:
            return getattr(self.table.cols, self.cname).is_indexed
        except AttributeError:
            # the table or column may not exist yet
            return False

    def copy(self):
        new_self = copy.copy(self)
        return new_self

    def infer(self, handler):
        """infer this column from the table: create and return a new object"""
        table = handler.table
        new_self = self.copy()
        new_self.set_table(table)
        new_self.get_attr()
        new_self.read_metadata(handler)
        return new_self

    def convert(self, values, nan_rep, encoding, errors):
        """ set the values from this selection: take = take ownership """

        # values is a recarray
        if values.dtype.fields is not None:
            values = values[self.cname]

        values = _maybe_convert(values, self.kind, encoding, errors)

        kwargs = dict()
        if self.freq is not None:
            kwargs['freq'] = _ensure_decoded(self.freq)
        if self.index_name is not None:
            kwargs['name'] = _ensure_decoded(self.index_name)

        # making an Index instance could throw a number of different errors
        try:
            self.values = Index(values, **kwargs)
        except Exception:  # noqa: E722
            # if the output freq is different than what we recorded,
            # it should be None (see also 'doc example part 2')
            if 'freq' in kwargs:
                kwargs['freq'] = None
            self.values = Index(values, **kwargs)

        self.values = _set_tz(self.values, self.tz)

        return self

    def take_data(self):
        """ return the values & release the memory """
        self.values, values = None, self.values
        return values

    @property
    def attrs(self):
        return self.table._v_attrs

    @property
    def description(self):
        return self.table.description

    @property
    def col(self):
        """ return my current col description """
        return getattr(self.description, self.cname, None)

    @property
    def cvalues(self):
        """ return my cython values """
        return self.values

    def __iter__(self):
        return iter(self.values)

    def maybe_set_size(self, min_itemsize=None):
        """ maybe set a string col itemsize:
            min_itemsize can be an integer or a dict with this columns name
            with an integer size """
        if _ensure_decoded(self.kind) == u'string':

            if isinstance(min_itemsize, dict):
                min_itemsize = min_itemsize.get(self.name)

            if min_itemsize is not None and self.typ.itemsize < min_itemsize:
                self.typ = _tables(
                ).StringCol(itemsize=min_itemsize, pos=self.pos)
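
    # Illustrative only: callers typically reach this via something like
    #   store.append('df', df, min_itemsize={'strcol': 30})
    # which widens the stored string column to at least 30 bytes.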

    def validate(self, handler, append):
        self.validate_names()

    def validate_names(self):
        pass

    def validate_and_set(self, handler, append):
        self.set_table(handler.table)
        self.validate_col()
        self.validate_attr(append)
        self.validate_metadata(handler)
        self.write_metadata(handler)
        self.set_attr()

    def validate_col(self, itemsize=None):
        """ validate this column: return the compared against itemsize """

        # validate this column for string truncation (or reset to the max
        # size)
        if _ensure_decoded(self.kind) == u'string':
            c = self.col
            if c is not None:
                if itemsize is None:
                    itemsize = self.itemsize
                if c.itemsize < itemsize:
                    raise ValueError(
                        "Trying to store a string with len [{itemsize}] in "
                        "[{cname}] column but\nthis column has a limit of "
                        "[{c_itemsize}]!\nConsider using min_itemsize to "
                        "preset the sizes on these columns".format(
                            itemsize=itemsize, cname=self.cname,
                            c_itemsize=c.itemsize))
                return c.itemsize

        return None

    def validate_attr(self, append):
        # check for backwards incompatibility
        if append:
            existing_kind = getattr(self.attrs, self.kind_attr, None)
            if existing_kind is not None and existing_kind != self.kind:
                raise TypeError(
                    "incompatible kind in col [{existing} - "
                    "{self_kind}]".format(
                        existing=existing_kind, self_kind=self.kind))

    def update_info(self, info):
        """ set/update the info for this indexable with the key/value
            if there is a conflict raise/warn as needed """

        for key in self._info_fields:

            value = getattr(self, key, None)
            idx = _get_info(info, self.name)

            existing_value = idx.get(key)
            if key in idx and value is not None and existing_value != value:

                # frequency/name just warn
                if key in ['freq', 'index_name']:
                    ws = attribute_conflict_doc % (key, existing_value, value)
                    warnings.warn(ws, AttributeConflictWarning, stacklevel=6)

                    # reset
                    idx[key] = None
                    setattr(self, key, None)

                else:
                    raise ValueError(
                        "invalid info for [{name}] for [{key}], "
                        "existing_value [{existing_value}] conflicts with "
                        "new value [{value}]".format(
                            name=self.name, key=key,
                            existing_value=existing_value, value=value))
            else:
                if value is not None or existing_value is not None:
                    idx[key] = value

        return self

    def set_info(self, info):
        """ set my state from the passed info """
        idx = info.get(self.name)
        if idx is not None:
            self.__dict__.update(idx)

    def get_attr(self):
        """ set the kind for this column """
        self.kind = getattr(self.attrs, self.kind_attr, None)

    def set_attr(self):
        """ set the kind for this column """
        setattr(self.attrs, self.kind_attr, self.kind)

    def read_metadata(self, handler):
        """ retrieve the metadata for this column """
        self.metadata = handler.read_metadata(self.cname)

    def validate_metadata(self, handler):
        """ validate that kind=category does not change the categories """
        if self.meta == 'category':
            new_metadata = self.metadata
            cur_metadata = handler.read_metadata(self.cname)
            if (new_metadata is not None and cur_metadata is not None and
                    not array_equivalent(new_metadata, cur_metadata)):
                raise ValueError("cannot append a categorical with "
                                 "different categories to the existing")

    def write_metadata(self, handler):
        """ set the meta data """
        if self.metadata is not None:
            handler.write_metadata(self.cname, self.metadata)


class GenericIndexCol(IndexCol):

    """ an index which is not represented in the data of the table """

    @property
    def is_indexed(self):
        return False

    def convert(self, values, nan_rep, encoding, errors):
        """ set the values from this selection: take = take ownership """
        self.values = Int64Index(np.arange(self.table.nrows))
        return self

    def get_attr(self):
        pass

    def set_attr(self):
        pass


class DataCol(IndexCol):

    """ a data holding column, by definition this is not indexable

    Parameters
    ----------
    data   : the actual data
    cname  : the column name in the table to hold the data (typically
        values)
    meta   : a string description of the metadata
    metadata : the actual metadata
    """

    is_an_indexable = False
    is_data_indexable = False
    _info_fields = ['tz', 'ordered']

    @classmethod
    def create_for_block(
            cls, i=None, name=None, cname=None, version=None, **kwargs):
        """ return a new datacol with the block i """

        if cname is None:
            cname = name or 'values_block_{idx}'.format(idx=i)
        if name is None:
            name = cname

        # prior to 0.10.1, we named values blocks like: values_block_0 and
        # the name values_0
        try:
            if version[0] == 0 and version[1] <= 10 and version[2] == 0:
                m = re.search(r"values_block_(\d+)", name)
                if m:
                    name = "values_{group}".format(group=m.groups()[0])
        except IndexError:
            pass

        return cls(name=name, cname=cname, **kwargs)

    def __init__(self, values=None, kind=None, typ=None,
                 cname=None, data=None, meta=None, metadata=None,
                 block=None, **kwargs):
        super(DataCol, self).__init__(values=values, kind=kind, typ=typ,
                                      cname=cname, **kwargs)
        self.dtype = None
        self.dtype_attr = u'{name}_dtype'.format(name=self.name)
        self.meta = meta
        self.meta_attr = u'{name}_meta'.format(name=self.name)
        self.set_data(data)
        self.set_metadata(metadata)

    def __unicode__(self):
        temp = tuple(
            map(pprint_thing,
                (self.name,
                 self.cname,
                 self.dtype,
                 self.kind,
                 self.shape)))
        return ','.join(("{key}->{value}".format(key=key, value=value)
                         for key, value in zip(
                             ['name', 'cname', 'dtype', 'kind', 'shape'],
                             temp)))

    def __eq__(self, other):
        """ compare 2 col items """
        return all(getattr(self, a, None) == getattr(other, a, None)
                   for a in ['name', 'cname', 'dtype', 'pos'])

    def set_data(self, data, dtype=None):
        self.data = data
        if data is not None:
            if dtype is not None:
                self.dtype = dtype
                self.set_kind()
            elif self.dtype is None:
                self.dtype = data.dtype.name
                self.set_kind()

    def take_data(self):
        """ return the data & release the memory """
        self.data, data = None, self.data
        return data

    def set_metadata(self, metadata):
        """ record the metadata """
        if metadata is not None:
            metadata = np.array(metadata, copy=False).ravel()
        self.metadata = metadata

    def set_kind(self):
        # set my kind if we can
        if self.dtype is not None:
            dtype = _ensure_decoded(self.dtype)

            if dtype.startswith(u'string') or dtype.startswith(u'bytes'):
                self.kind = 'string'
            elif dtype.startswith(u'float'):
                self.kind = 'float'
            elif dtype.startswith(u'complex'):
                self.kind = 'complex'
            elif dtype.startswith(u'int') or dtype.startswith(u'uint'):
                self.kind = 'integer'
            elif dtype.startswith(u'date'):
                self.kind = 'datetime'
            elif dtype.startswith(u'timedelta'):
                self.kind = 'timedelta'
            elif dtype.startswith(u'bool'):
                self.kind = 'bool'
            else:
                raise AssertionError(
                    "cannot interpret dtype of [{dtype}] in [{obj}]".format(
                        dtype=dtype, obj=self))

        # set my typ if we need
        if self.typ is None:
            self.typ = getattr(self.description, self.cname, None)

    def set_atom(self, block, block_items, existing_col, min_itemsize,
                 nan_rep, info, encoding=None, errors='strict'):
        """ create and setup my atom from the block """

        self.values = list(block_items)

        # short-cut certain block types
        if block.is_categorical:
            return self.set_atom_categorical(block, items=block_items,
                                             info=info)
        elif block.is_datetimetz:
            return self.set_atom_datetime64tz(block, info=info)
        elif block.is_datetime:
            return self.set_atom_datetime64(block)
        elif block.is_timedelta:
            return self.set_atom_timedelta64(block)
        elif block.is_complex:
            return self.set_atom_complex(block)

        dtype = block.dtype.name
        inferred_type = lib.infer_dtype(block.values, skipna=False)

        if inferred_type == 'date':
            raise TypeError(
                "[date] is not implemented as a table column")
        elif inferred_type == 'datetime':
            # after GH#8260
            # this only would be hit for a multi-timezone dtype
            # which is an error
            raise TypeError(
                "too many timezones in this block, create separate "
                "data columns"
            )
        elif inferred_type == 'unicode':
            raise TypeError(
                "[unicode] is not implemented as a table column")

        # this is basically a catchall; if, say, a datetime64 has nans
        # it will end up here ###
        elif inferred_type == 'string' or dtype == 'object':
            self.set_atom_string(
                block, block_items,
                existing_col,
                min_itemsize,
                nan_rep,
                encoding,
                errors)

        # set as a data block
        else:
            self.set_atom_data(block)

    def get_atom_string(self, block, itemsize):
        return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])

    def set_atom_string(self, block, block_items, existing_col, min_itemsize,
                        nan_rep, encoding, errors):
        # fill nan items with myself, don't disturb the blocks by
        # trying to downcast
        block = block.fillna(nan_rep, downcast=False)
        if isinstance(block, list):
            block = block[0]
        data = block.values

        # see if we have a valid string type
        inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
        if inferred_type != 'string':

            # we cannot serialize this data, so report an exception on a
            # column by column basis
            for i, item in enumerate(block_items):

                col = block.iget(i)
                inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
                if inferred_type != 'string':
                    raise TypeError(
                        "Cannot serialize the column [{item}] because\n"
                        "its data contents are [{type}] object dtype".format(
                            item=item, type=inferred_type)
                    )

        # itemsize is the maximum length of a string (along any dimension)
        data_converted = _convert_string_array(data, encoding, errors)
        itemsize = data_converted.itemsize

        # specified min_itemsize?
        if isinstance(min_itemsize, dict):
            min_itemsize = int(min_itemsize.get(
                self.name) or min_itemsize.get('values') or 0)
        itemsize = max(min_itemsize or 0, itemsize)

        # check for column in the values conflicts
        if existing_col is not None:
            eci = existing_col.validate_col(itemsize)
            if eci > itemsize:
                itemsize = eci

        self.itemsize = itemsize
        self.kind = 'string'
        self.typ = self.get_atom_string(block, itemsize)
        self.set_data(data_converted.astype(
            '|S{size}'.format(size=itemsize), copy=False))

    def get_atom_coltype(self, kind=None):
        """ return the PyTables column class for this column """
        if kind is None:
            kind = self.kind
        if self.kind.startswith('uint'):
            col_name = "UInt{name}Col".format(name=kind[4:])
        else:
            col_name = "{name}Col".format(name=kind.capitalize())

        return getattr(_tables(), col_name)
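
    # e.g. kind 'float64' resolves to tables.Float64Col and 'uint32' to
    # tables.UInt32Col (illustrative; the class is looked up by name on the
    # tables module at runtime)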

    def get_atom_data(self, block, kind=None):
        return self.get_atom_coltype(kind=kind)(shape=block.shape[0])

    def set_atom_complex(self, block):
        self.kind = block.dtype.name
        itemsize = int(self.kind.split('complex')[-1]) // 8
        self.typ = _tables().ComplexCol(
            itemsize=itemsize, shape=block.shape[0])
        self.set_data(block.values.astype(self.typ.type, copy=False))

    def set_atom_data(self, block):
        self.kind = block.dtype.name
        self.typ = self.get_atom_data(block)
        self.set_data(block.values.astype(self.typ.type, copy=False))

    def set_atom_categorical(self, block, items, info=None, values=None):
        # currently only supports a 1-D categorical
        # in a 1-D block

        values = block.values
        codes = values.codes
        self.kind = 'integer'
        self.dtype = codes.dtype.name
        if values.ndim > 1:
            raise NotImplementedError("only support 1-d categoricals")
        if len(items) > 1:
            raise NotImplementedError(
                "only support single block categoricals")

        # write the codes; must be in a block shape
        self.ordered = values.ordered
        self.typ = self.get_atom_data(block, kind=codes.dtype.name)
        self.set_data(_block_shape(codes))

        # write the categories
        self.meta = 'category'
        self.set_metadata(block.values.categories)

        # update the info
        self.update_info(info)

    def get_atom_datetime64(self, block):
        return _tables().Int64Col(shape=block.shape[0])

    def set_atom_datetime64(self, block, values=None):
        self.kind = 'datetime64'
        self.typ = self.get_atom_datetime64(block)
        if values is None:
            values = block.values.view('i8')
        self.set_data(values, 'datetime64')

    def set_atom_datetime64tz(self, block, info, values=None):
        if values is None:
            values = block.values

        # convert this column to i8 in UTC, and save the tz
        values = values.asi8.reshape(block.shape)

        # store a converted timezone
        self.tz = _get_tz(block.values.tz)
        self.update_info(info)

        self.kind = 'datetime64'
        self.typ = self.get_atom_datetime64(block)
        self.set_data(values, 'datetime64')

    def get_atom_timedelta64(self, block):
        return _tables().Int64Col(shape=block.shape[0])

    def set_atom_timedelta64(self, block, values=None):
        self.kind = 'timedelta64'
        self.typ = self.get_atom_timedelta64(block)
        if values is None:
            values = block.values.view('i8')
        self.set_data(values, 'timedelta64')

    @property
    def shape(self):
        return getattr(self.data, 'shape', None)

    @property
    def cvalues(self):
        """ return my cython values """
        return self.data

    def validate_attr(self, append):
        """validate that we have the same order as the existing & same dtype"""
        if append:
            existing_fields = getattr(self.attrs, self.kind_attr, None)
            if (existing_fields is not None and
                    existing_fields != list(self.values)):
                raise ValueError("appended items do not match existing items"
                                 " in table!")

            existing_dtype = getattr(self.attrs, self.dtype_attr, None)
            if (existing_dtype is not None and
                    existing_dtype != self.dtype):
                raise ValueError("appended items dtype does not match "
                                 "existing items dtype in table!")

    def convert(self, values, nan_rep, encoding, errors):
        """set the data from this selection (and convert to the correct dtype
        if we can)
        """

        # values is a recarray
        if values.dtype.fields is not None:
            values = values[self.cname]

        self.set_data(values)

        # use the meta if needed
        meta = _ensure_decoded(self.meta)

        # convert to the correct dtype
        if self.dtype is not None:
            dtype = _ensure_decoded(self.dtype)

            # reverse converts
            if dtype == u'datetime64':

                # recreate with tz if indicated
                self.data = _set_tz(self.data, self.tz, coerce=True)

            elif dtype == u'timedelta64':
                self.data = np.asarray(self.data, dtype='m8[ns]')
            elif dtype == u'date':
                try:
                    self.data = np.asarray(
                        [date.fromordinal(v) for v in self.data],
                        dtype=object)
                except ValueError:
                    self.data = np.asarray(
                        [date.fromtimestamp(v) for v in self.data],
                        dtype=object)
            elif dtype == u'datetime':
                self.data = np.asarray(
                    [datetime.fromtimestamp(v) for v in self.data],
                    dtype=object)

            elif meta == u'category':

                # we have a categorical
                categories = self.metadata
                codes = self.data.ravel()

                # if we have stored a NaN in the categories
                # then strip it; in theory we could have BOTH
                # -1s in the codes and nulls :<
                if categories is None:
                    # Handle case of NaN-only categorical columns in which
                    # case the categories are an empty array; when this is
                    # stored, pytables cannot write a zero-len array, so on
                    # readback the categories would be None and `read_hdf()`
                    # would fail.
                    categories = Index([], dtype=np.float64)
                else:
                    mask = isna(categories)
                    if mask.any():
                        categories = categories[~mask]
                        codes[codes != -1] -= mask.astype(int).cumsum().values

                self.data = Categorical.from_codes(codes,
                                                   categories=categories,
                                                   ordered=self.ordered)

            else:

                try:
                    self.data = self.data.astype(dtype, copy=False)
                except TypeError:
                    self.data = self.data.astype('O', copy=False)

        # convert nans / decode
        if _ensure_decoded(self.kind) == u'string':
            self.data = _unconvert_string_array(
                self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)

        return self

    def get_attr(self):
        """ get the data for this column """
        self.values = getattr(self.attrs, self.kind_attr, None)
        self.dtype = getattr(self.attrs, self.dtype_attr, None)
        self.meta = getattr(self.attrs, self.meta_attr, None)
        self.set_kind()

    def set_attr(self):
        """ set the data for this column """
        setattr(self.attrs, self.kind_attr, self.values)
        setattr(self.attrs, self.meta_attr, self.meta)
        if self.dtype is not None:
            setattr(self.attrs, self.dtype_attr, self.dtype)


class DataIndexableCol(DataCol):

    """ represent a data column that can be indexed """

    is_data_indexable = True

    def validate_names(self):
        if not Index(self.values).is_object():
            raise ValueError("cannot have non-object label DataIndexableCol")

    def get_atom_string(self, block, itemsize):
        return _tables().StringCol(itemsize=itemsize)

    def get_atom_data(self, block, kind=None):
        return self.get_atom_coltype(kind=kind)()

    def get_atom_datetime64(self, block):
        return _tables().Int64Col()

    def get_atom_timedelta64(self, block):
        return _tables().Int64Col()


class GenericDataIndexableCol(DataIndexableCol):

    """ represent a generic pytables data column """

    def get_attr(self):
        pass


class Fixed(StringMixin):

    """ represent an object in my store
        facilitate read/write of various types of objects
        this is an abstract base class

    Parameters
    ----------
    parent : my parent HDFStore
    group  : the group node where the table resides
    """

    pandas_kind = None
    obj_type = None
    ndim = None
    is_table = False

    def __init__(self, parent, group, encoding=None, errors='strict',
                 **kwargs):
        self.parent = parent
        self.group = group
        self.encoding = _ensure_encoding(encoding)
        self.errors = errors
        self.set_version()

    @property
    def is_old_version(self):
        return (self.version[0] <= 0 and self.version[1] <= 10 and
                self.version[2] < 1)

    def set_version(self):
        """ compute and set our version """
        version = _ensure_decoded(
            getattr(self.group._v_attrs, 'pandas_version', None))
        try:
            self.version = tuple(int(x) for x in version.split('.'))
            if len(self.version) == 2:
                self.version = self.version + (0,)
        except AttributeError:
            self.version = (0, 0, 0)
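
    # e.g. a stored pandas_version of '0.15.2' becomes (0, 15, 2); a missing
    # attribute falls back to (0, 0, 0)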

    @property
    def pandas_type(self):
        return _ensure_decoded(getattr(self.group._v_attrs,
                                       'pandas_type', None))

    @property
    def format_type(self):
        return 'fixed'

    def __unicode__(self):
        """ return a pretty representation of myself """
        self.infer_axes()
        s = self.shape
        if s is not None:
            if isinstance(s, (list, tuple)):
                s = "[{shape}]".format(
                    shape=','.join(pprint_thing(x) for x in s))
            return "{type:12.12} (shape->{shape})".format(
                type=self.pandas_type, shape=s)
        return self.pandas_type

    def set_object_info(self):
        """ set my pandas type & version """
        self.attrs.pandas_type = str(self.pandas_kind)
        self.attrs.pandas_version = str(_version)
        self.set_version()

    def copy(self):
        new_self = copy.copy(self)
        return new_self

    @property
    def storage_obj_type(self):
        return self.obj_type

    @property
    def shape(self):
        return self.nrows

    @property
    def pathname(self):
        return self.group._v_pathname

    @property
    def _handle(self):
        return self.parent._handle

    @property
    def _filters(self):
        return self.parent._filters

    @property
    def _complevel(self):
        return self.parent._complevel

    @property
    def _fletcher32(self):
        return self.parent._fletcher32

    @property
    def _complib(self):
        return self.parent._complib

    @property
    def attrs(self):
        return self.group._v_attrs

    def set_attrs(self):
        """ set our object attributes """
        pass

    def get_attrs(self):
        """ get our object attributes """
        pass

    @property
    def storable(self):
        """ return my storable """
        return self.group

    @property
    def is_exists(self):
        return False

    @property
    def nrows(self):
        return getattr(self.storable, 'nrows', None)

    def validate(self, other):
        """ validate against an existing storable """
        if other is None:
            return
        return True

    def validate_version(self, where=None):
        """ are we trying to operate on an old version? """
        return True

    def infer_axes(self):
        """ infer the axes of my storer
            return a boolean indicating if we have a valid storer or not """

        s = self.storable
        if s is None:
            return False
        self.get_attrs()
        return True

    def read(self, **kwargs):
        raise NotImplementedError(
            "cannot read on an abstract storer: subclasses should implement")

    def write(self, **kwargs):
        raise NotImplementedError(
            "cannot write on an abstract storer: subclasses should implement")

    def delete(self, where=None, start=None, stop=None, **kwargs):
        """
        support fully deleting the node in its entirety (only) - where
        specification must be None
        """
        if com._all_none(where, start, stop):
            self._handle.remove_node(self.group, recursive=True)
            return None

        raise TypeError("cannot delete on an abstract storer")


class GenericFixed(Fixed):

    """ a generified fixed version """

    _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'}
    _reverse_index_map = {v: k for k, v in compat.iteritems(_index_type_map)}
    attributes = []

    # indexer helpers
    def _class_to_alias(self, cls):
        return self._index_type_map.get(cls, '')

    def _alias_to_class(self, alias):
        if isinstance(alias, type):  # pragma: no cover
            # compat: for a short period of time master stored types
            return alias
        return self._reverse_index_map.get(alias, Index)

    def _get_index_factory(self, klass):
        if klass == DatetimeIndex:
            def f(values, freq=None, tz=None):
                # data are already in UTC, localize and convert if tz present
                result = DatetimeIndex._simple_new(values.values, name=None,
                                                   freq=freq)
                if tz is not None:
                    result = result.tz_localize('UTC').tz_convert(tz)
                return result
            return f
        elif klass == PeriodIndex:
            def f(values, freq=None, tz=None):
                return PeriodIndex._simple_new(values, name=None, freq=freq)
            return f

        return klass

    def validate_read(self, kwargs):
        """
        remove table keywords from kwargs and return
        raise if any keywords are passed which are not-None
        """
        kwargs = copy.copy(kwargs)

        columns = kwargs.pop('columns', None)
        if columns is not None:
            raise TypeError("cannot pass a column specification when reading "
                            "a Fixed format store. this store must be "
                            "selected in its entirety")
        where = kwargs.pop('where', None)
        if where is not None:
            raise TypeError("cannot pass a where specification when reading "
                            "from a Fixed format store. this store must be "
                            "selected in its entirety")
        return kwargs

    @property
    def is_exists(self):
        return True

    def set_attrs(self):
        """ set our object attributes """
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors

    def get_attrs(self):
        """ retrieve our attributes """
        self.encoding = _ensure_encoding(
            getattr(self.attrs, 'encoding', None))
        self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
        for n in self.attributes:
            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))

    def write(self, obj, **kwargs):
        self.set_attrs()

    def read_array(self, key, start=None, stop=None):
        """ read an array for the specified node (off of the group) """
        import tables
        node = getattr(self.group, key)
        attrs = node._v_attrs

        transposed = getattr(attrs, 'transposed', False)

        if isinstance(node, tables.VLArray):
            ret = node[0][start:stop]
        else:
            dtype = getattr(attrs, 'value_type', None)
            shape = getattr(attrs, 'shape', None)

            if shape is not None:
                # length 0 axis
                ret = np.empty(shape, dtype=dtype)
            else:
                ret = node[start:stop]

            if dtype == u'datetime64':

                # reconstruct a timezone if indicated
                ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)

            elif dtype == u'timedelta64':
                ret = np.asarray(ret, dtype='m8[ns]')

        if transposed:
            return ret.T
        else:
            return ret

    def read_index(self, key, **kwargs):
        variety = _ensure_decoded(
            getattr(self.attrs, '{key}_variety'.format(key=key)))

        if variety == u'multi':
            return self.read_multi_index(key, **kwargs)
        elif variety == u'block':
            return self.read_block_index(key, **kwargs)
        elif variety == u'sparseint':
            return self.read_sparse_intindex(key, **kwargs)
        elif variety == u'regular':
            _, index = self.read_index_node(getattr(self.group, key),
                                            **kwargs)
            return index
        else:  # pragma: no cover
            raise TypeError(
                'unrecognized index variety: {variety}'.format(
                    variety=variety))

    def write_index(self, key, index):
        if isinstance(index, MultiIndex):
            setattr(self.attrs, '{key}_variety'.format(key=key), 'multi')
            self.write_multi_index(key, index)
        elif isinstance(index, BlockIndex):
            setattr(self.attrs, '{key}_variety'.format(key=key), 'block')
            self.write_block_index(key, index)
        elif isinstance(index, IntIndex):
            setattr(self.attrs, '{key}_variety'.format(key=key), 'sparseint')
            self.write_sparse_intindex(key, index)
        else:
            setattr(self.attrs, '{key}_variety'.format(key=key), 'regular')
            converted = _convert_index(index, self.encoding, self.errors,
                                       self.format_type).set_name('index')

            self.write_array(key, converted.values)

            node = getattr(self.group, key)
            node._v_attrs.kind = converted.kind
            node._v_attrs.name = index.name

            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                node._v_attrs.index_class = self._class_to_alias(type(index))

            if hasattr(index, 'freq'):
                node._v_attrs.freq = index.freq

            if hasattr(index, 'tz') and index.tz is not None:
                node._v_attrs.tz = _get_tz(index.tz)

    def write_block_index(self, key, index):
        self.write_array('{key}_blocs'.format(key=key), index.blocs)
        self.write_array('{key}_blengths'.format(key=key), index.blengths)
        setattr(self.attrs, '{key}_length'.format(key=key), index.length)

    def read_block_index(self, key, **kwargs):
        length = getattr(self.attrs, '{key}_length'.format(key=key))
        blocs = self.read_array('{key}_blocs'.format(key=key), **kwargs)
        blengths = self.read_array('{key}_blengths'.format(key=key), **kwargs)
        return BlockIndex(length, blocs, blengths)

    def write_sparse_intindex(self, key, index):
        self.write_array('{key}_indices'.format(key=key), index.indices)
        setattr(self.attrs, '{key}_length'.format(key=key), index.length)

    def read_sparse_intindex(self, key, **kwargs):
        length = getattr(self.attrs, '{key}_length'.format(key=key))
        indices = self.read_array('{key}_indices'.format(key=key), **kwargs)
        return IntIndex(length, indices)

    def write_multi_index(self, key, index):
        setattr(self.attrs, '{key}_nlevels'.format(key=key), index.nlevels)

        for i, (lev, level_codes, name) in enumerate(zip(index.levels,
                                                         index.codes,
                                                         index.names)):
            # write the level
            level_key = '{key}_level{idx}'.format(key=key, idx=i)
            conv_level = _convert_index(lev, self.encoding, self.errors,
                                        self.format_type).set_name(level_key)
            self.write_array(level_key, conv_level.values)
            node = getattr(self.group, level_key)
            node._v_attrs.kind = conv_level.kind
            node._v_attrs.name = name

            # write the name
            setattr(node._v_attrs, '{key}_name{name}'.format(
                key=key, name=name), name)

            # write the labels
            label_key = '{key}_label{idx}'.format(key=key, idx=i)
            self.write_array(label_key, level_codes)
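
    # Illustrative on-disk layout for a 2-level MultiIndex written under key
    # 'index': nodes 'index_level0'/'index_label0' and
    # 'index_level1'/'index_label1', plus an 'index_nlevels' attribute
    # (names follow the format strings above).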

    def read_multi_index(self, key, **kwargs):
        nlevels = getattr(self.attrs, '{key}_nlevels'.format(key=key))

        levels = []
        codes = []
        names = []
        for i in range(nlevels):
            level_key = '{key}_level{idx}'.format(key=key, idx=i)
            name, lev = self.read_index_node(getattr(self.group, level_key),
                                             **kwargs)
            levels.append(lev)
            names.append(name)

            label_key = '{key}_label{idx}'.format(key=key, idx=i)
            level_codes = self.read_array(label_key, **kwargs)
            codes.append(level_codes)

        return MultiIndex(levels=levels, codes=codes, names=names,
                          verify_integrity=True)

    def read_index_node(self, node, start=None, stop=None):
        data = node[start:stop]
        # If the index was an empty array write_array_empty() will
        # have written a sentinel. Here we replace it with the original.
        if ('shape' in node._v_attrs and
                self._is_empty_array(getattr(node._v_attrs, 'shape'))):
            data = np.empty(getattr(node._v_attrs, 'shape'),
                            dtype=getattr(node._v_attrs, 'value_type'))
        kind = _ensure_decoded(node._v_attrs.kind)
        name = None

        if 'name' in node._v_attrs:
            name = _ensure_str(node._v_attrs.name)
            name = _ensure_decoded(name)

        index_class = self._alias_to_class(_ensure_decoded(
            getattr(node._v_attrs, 'index_class', '')))
        factory = self._get_index_factory(index_class)

        kwargs = {}
        if u'freq' in node._v_attrs:
            kwargs['freq'] = node._v_attrs['freq']

        if u'tz' in node._v_attrs:
            kwargs['tz'] = node._v_attrs['tz']

        if kind in (u'date', u'datetime'):
            index = factory(_unconvert_index(data, kind,
                                             encoding=self.encoding,
                                             errors=self.errors),
                            dtype=object, **kwargs)
        else:
            index = factory(_unconvert_index(data, kind,
                                             encoding=self.encoding,
                                             errors=self.errors), **kwargs)

        index.name = name

        return name, index

    def write_array_empty(self, key, value):
        """ write a 0-len array """

        # ugly hack for length 0 axes
        arr = np.empty((1,) * value.ndim)
        self._handle.create_array(self.group, key, arr)
        getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
        getattr(self.group, key)._v_attrs.shape = value.shape

    def _is_empty_array(self, shape):
        """Returns true if any axis is zero length."""
        return any(x == 0 for x in shape)

    def write_array(self, key, value, items=None):
        if key in self.group:
            self._handle.remove_node(self.group, key)

        # Transform needed to interface with pytables row/col notation
        empty_array = self._is_empty_array(value.shape)
        transposed = False

        if is_categorical_dtype(value):
            raise NotImplementedError('Cannot store a category dtype in '
                                      'a HDF5 dataset that uses format='
                                      '"fixed". Use format="table".')
        if not empty_array:
            if hasattr(value, 'T'):
                # ExtensionArrays (1d) may not have transpose.
                value = value.T
                transposed = True

        if self._filters is not None:
            atom = None
            try:
                # get the atom for this datatype
                atom = _tables().Atom.from_dtype(value.dtype)
            except ValueError:
                pass

            if atom is not None:
                # create an empty chunked array and fill it from value
                if not empty_array:
                    ca = self._handle.create_carray(self.group, key, atom,
                                                    value.shape,
                                                    filters=self._filters)
                    ca[:] = value
                    getattr(self.group, key)._v_attrs.transposed = transposed

                else:
                    self.write_array_empty(key, value)

                return

        if value.dtype.type == np.object_:

            # infer the type, warn if we have a non-string type here (for
            # performance)
            inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
            if empty_array:
                pass
            elif inferred_type == 'string':
                pass
            else:
                try:
                    items = list(items)
                except TypeError:
                    pass
                ws = performance_doc % (inferred_type, key, items)
                warnings.warn(ws, PerformanceWarning, stacklevel=7)

            vlarr = self._handle.create_vlarray(self.group, key,
                                                _tables().ObjectAtom())
            vlarr.append(value)
        else:
            if empty_array:
                self.write_array_empty(key, value)
            else:
                if is_datetime64_dtype(value.dtype):
                    self._handle.create_array(
                        self.group, key, value.view('i8'))
                    getattr(
                        self.group, key)._v_attrs.value_type = 'datetime64'
                elif is_datetime64tz_dtype(value.dtype):
                    # store as UTC
                    # with a zone
                    self._handle.create_array(self.group, key,
                                              value.asi8)

                    node = getattr(self.group, key)
                    node._v_attrs.tz = _get_tz(value.tz)
                    node._v_attrs.value_type = 'datetime64'
                elif is_timedelta64_dtype(value.dtype):
                    self._handle.create_array(
                        self.group, key, value.view('i8'))
                    getattr(
                        self.group, key)._v_attrs.value_type = 'timedelta64'
                else:
                    self._handle.create_array(self.group, key, value)

        getattr(self.group, key)._v_attrs.transposed = transposed


class LegacyFixed(GenericFixed):

    def read_index_legacy(self, key, start=None, stop=None):
        node = getattr(self.group, key)
        data = node[start:stop]
        kind = node._v_attrs.kind
        return _unconvert_index_legacy(data, kind, encoding=self.encoding,
                                       errors=self.errors)


class LegacySeriesFixed(LegacyFixed):

    def read(self, **kwargs):
        kwargs = self.validate_read(kwargs)
        index = self.read_index_legacy('index')
        values = self.read_array('values')
        return Series(values, index=index)


class LegacyFrameFixed(LegacyFixed):

    def read(self, **kwargs):
        kwargs = self.validate_read(kwargs)
        index = self.read_index_legacy('index')
        columns = self.read_index_legacy('columns')
        values = self.read_array('values')
        return DataFrame(values, index=index, columns=columns)


class SeriesFixed(GenericFixed):
    pandas_kind = u'series'
    attributes = ['name']

    @property
    def shape(self):
        try:
            return len(getattr(self.group, 'values')),
        except (TypeError, AttributeError):
            return None

    def read(self, **kwargs):
        kwargs = self.validate_read(kwargs)
        index = self.read_index('index', **kwargs)
        values = self.read_array('values', **kwargs)
        return Series(values, index=index, name=self.name)

    def write(self, obj, **kwargs):
        super(SeriesFixed, self).write(obj, **kwargs)
        self.write_index('index', obj.index)
        self.write_array('values', obj.values)
        self.attrs.name = obj.name


class SparseFixed(GenericFixed):

    def validate_read(self, kwargs):
        """
        we don't support start, stop kwds in Sparse
        """
        kwargs = super(SparseFixed, self).validate_read(kwargs)
        if 'start' in kwargs or 'stop' in kwargs:
            raise NotImplementedError("start and/or stop are not supported "
                                      "in fixed Sparse reading")
        return kwargs


class SparseSeriesFixed(SparseFixed):
    pandas_kind = u'sparse_series'
    attributes = ['name', 'fill_value', 'kind']

    def read(self, **kwargs):
        kwargs = self.validate_read(kwargs)
        index = self.read_index('index')
        sp_values = self.read_array('sp_values')
        sp_index = self.read_index('sp_index')
        return SparseSeries(sp_values, index=index, sparse_index=sp_index,
                            kind=self.kind or u'block',
                            fill_value=self.fill_value,
                            name=self.name)

    def write(self, obj, **kwargs):
        super(SparseSeriesFixed, self).write(obj, **kwargs)
        self.write_index('index', obj.index)
        self.write_index('sp_index', obj.sp_index)
        self.write_array('sp_values', obj.sp_values)
        self.attrs.name = obj.name
        self.attrs.fill_value = obj.fill_value
        self.attrs.kind = obj.kind


class SparseFrameFixed(SparseFixed):
    pandas_kind = u'sparse_frame'
    attributes = ['default_kind', 'default_fill_value']

    def read(self, **kwargs):
        kwargs = self.validate_read(kwargs)
        columns = self.read_index('columns')
        sdict = {}
        for c in columns:
            key = 'sparse_series_{columns}'.format(columns=c)
            s = SparseSeriesFixed(self.parent, getattr(self.group, key))
            s.infer_axes()
            sdict[c] = s.read()
        return SparseDataFrame(sdict, columns=columns,
                               default_kind=self.default_kind,
                               default_fill_value=self.default_fill_value)

    def write(self, obj, **kwargs):
        """ write it as a collection of individual sparse series """
        super(SparseFrameFixed, self).write(obj, **kwargs)
        for name, ss in compat.iteritems(obj):
            key = 'sparse_series_{name}'.format(name=name)
            if key not in self.group._v_children:
                node = self._handle.create_group(self.group, key)
            else:
                node = getattr(self.group, key)
            s = SparseSeriesFixed(self.parent, node)
            s.write(ss)
        self.attrs.default_fill_value = obj.default_fill_value
        self.attrs.default_kind = obj.default_kind
        self.write_index('columns', obj.columns)


class BlockManagerFixed(GenericFixed):
    attributes = ['ndim', 'nblocks']
    is_shape_reversed = False

    @property
    def shape(self):
        try:
            ndim = self.ndim

            # items
            items = 0
            for i in range(self.nblocks):
                node = getattr(self.group, 'block{idx}_items'.format(idx=i))
                shape = getattr(node, 'shape', None)
                if shape is not None:
                    items += shape[0]

            # data shape
            node = getattr(self.group, 'block0_values')
            shape = getattr(node, 'shape', None)
            if shape is not None:
                shape = list(shape[0:(ndim - 1)])
            else:
                shape = []

            shape.append(items)

            # hacky - this works for frames, but is reversed for panels
            if self.is_shape_reversed:
                shape = shape[::-1]

            return shape
        except AttributeError:
            return None
  2423. def read(self, start=None, stop=None, **kwargs):
  2424. # start, stop applied to rows, so 0th axis only
  2425. kwargs = self.validate_read(kwargs)
  2426. select_axis = self.obj_type()._get_block_manager_axis(0)
  2427. axes = []
  2428. for i in range(self.ndim):
  2429. _start, _stop = (start, stop) if i == select_axis else (None, None)
  2430. ax = self.read_index('axis{idx}'.format(
  2431. idx=i), start=_start, stop=_stop)
  2432. axes.append(ax)
  2433. items = axes[0]
  2434. blocks = []
  2435. for i in range(self.nblocks):
  2436. blk_items = self.read_index('block{idx}_items'.format(idx=i))
  2437. values = self.read_array('block{idx}_values'.format(idx=i),
  2438. start=_start, stop=_stop)
  2439. blk = make_block(values,
  2440. placement=items.get_indexer(blk_items))
  2441. blocks.append(blk)
  2442. return self.obj_type(BlockManager(blocks, axes))
  2443. def write(self, obj, **kwargs):
  2444. super(BlockManagerFixed, self).write(obj, **kwargs)
  2445. data = obj._data
  2446. if not data.is_consolidated():
  2447. data = data.consolidate()
  2448. self.attrs.ndim = data.ndim
  2449. for i, ax in enumerate(data.axes):
  2450. if i == 0:
  2451. if not ax.is_unique:
  2452. raise ValueError(
  2453. "Columns index has to be unique for fixed format")
  2454. self.write_index('axis{idx}'.format(idx=i), ax)
  2455. # Supporting mixed-type DataFrame objects...nontrivial
  2456. self.attrs.nblocks = len(data.blocks)
  2457. for i, blk in enumerate(data.blocks):
  2458. # I have no idea why, but writing values before items fixed #2299
  2459. blk_items = data.items.take(blk.mgr_locs)
  2460. self.write_array('block{idx}_values'.format(idx=i),
  2461. blk.values, items=blk_items)
  2462. self.write_index('block{idx}_items'.format(idx=i), blk_items)
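
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): a
# DataFrame written with the default fixed format goes through
# BlockManagerFixed, one array node per dtype block; note the unique-columns
# requirement enforced in write() above.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.put('df', df, format='fixed')
#   ...     assert store['df'].equals(df)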

class FrameFixed(BlockManagerFixed):
    pandas_kind = u'frame'
    obj_type = DataFrame


class PanelFixed(BlockManagerFixed):
    pandas_kind = u'wide'
    obj_type = Panel
    is_shape_reversed = True

    def write(self, obj, **kwargs):
        obj._consolidate_inplace()
        return super(PanelFixed, self).write(obj, **kwargs)

class Table(Fixed):

    """ represent a table:
          facilitate read/write of various types of tables

        Attrs in Table Node
        -------------------
        These are attributes that are stored in the main table node; they are
        necessary to recreate these tables when read back in.

        index_axes    : a list of tuples of the (original indexing axis and
            index column)
        non_index_axes: a list of tuples of the (original index axis and
            columns on a non-indexing axis)
        values_axes   : a list of the columns which comprise the data of this
            table
        data_columns  : a list of the columns that we are allowing indexing
            (these become single columns in values_axes), or True to force all
            columns
        nan_rep       : the string to use for nan representations for string
            objects
        levels        : the names of levels
        metadata      : the names of the metadata columns

        """
    pandas_kind = u'wide_table'
    table_type = None
    levels = 1
    is_table = True
    is_shape_reversed = False

    def __init__(self, *args, **kwargs):
        super(Table, self).__init__(*args, **kwargs)
        self.index_axes = []
        self.non_index_axes = []
        self.values_axes = []
        self.data_columns = []
        self.metadata = []
        self.info = dict()
        self.nan_rep = None
        self.selection = None

    @property
    def table_type_short(self):
        return self.table_type.split('_')[0]

    @property
    def format_type(self):
        return 'table'

    def __unicode__(self):
        """ return a pretty representation of myself """
        self.infer_axes()
        dc = ",dc->[{columns}]".format(columns=(','.join(
            self.data_columns) if len(self.data_columns) else ''))

        ver = ''
        if self.is_old_version:
            ver = "[{version}]".format(
                version='.'.join(str(x) for x in self.version))

        return (
            "{pandas_type:12.12}{ver} (typ->{table_type},nrows->{nrows},"
            "ncols->{ncols},indexers->[{index_axes}]{dc})".format(
                pandas_type=self.pandas_type, ver=ver,
                table_type=self.table_type_short, nrows=self.nrows,
                ncols=self.ncols,
                index_axes=(','.join(a.name for a in self.index_axes)), dc=dc
            ))

    def __getitem__(self, c):
        """ return the axis for c """
        for a in self.axes:
            if c == a.name:
                return a
        return None

    def validate(self, other):
        """ validate against an existing table """
        if other is None:
            return

        if other.table_type != self.table_type:
            raise TypeError(
                "incompatible table_type with existing "
                "[{other} - {self}]".format(
                    other=other.table_type, self=self.table_type))

        for c in ['index_axes', 'non_index_axes', 'values_axes']:
            sv = getattr(self, c, None)
            ov = getattr(other, c, None)
            if sv != ov:

                # show the error for the specific axes
                for i, sax in enumerate(sv):
                    oax = ov[i]
                    if sax != oax:
                        raise ValueError(
                            "invalid combination of [{c}] on appending data "
                            "[{sax}] vs current table [{oax}]".format(
                                c=c, sax=sax, oax=oax))

                # should never get here
                raise Exception(
                    "invalid combination of [{c}] on appending data [{sv}] vs "
                    "current table [{ov}]".format(c=c, sv=sv, ov=ov))

    @property
    def is_multi_index(self):
        """the levels attribute is 1 or a list in the case of a multi-index"""
        return isinstance(self.levels, list)

    def validate_metadata(self, existing):
        """ create / validate metadata """
        self.metadata = [
            c.name for c in self.values_axes if c.metadata is not None]

    def validate_multiindex(self, obj):
        """validate that we can store the multi-index; reset and return the
        new object
        """
        levels = [l if l is not None else "level_{0}".format(i)
                  for i, l in enumerate(obj.index.names)]
        try:
            return obj.reset_index(), levels
        except ValueError:
            raise ValueError("duplicate names/columns in the multi-index when "
                             "storing as a table")

    @property
    def nrows_expected(self):
        """ based on our axes, compute the expected nrows """
        return np.prod([i.cvalues.shape[0] for i in self.index_axes])

    @property
    def is_exists(self):
        """ has this table been created """
        return u'table' in self.group

    @property
    def storable(self):
        return getattr(self.group, 'table', None)

    @property
    def table(self):
        """ return the table group (this is my storable) """
        return self.storable

    @property
    def dtype(self):
        return self.table.dtype

    @property
    def description(self):
        return self.table.description

    @property
    def axes(self):
        return itertools.chain(self.index_axes, self.values_axes)

    @property
    def ncols(self):
        """ the number of total columns in the values axes """
        return sum(len(a.values) for a in self.values_axes)

    @property
    def is_transposed(self):
        return False

    @property
    def data_orientation(self):
        """return a tuple of my permuted axes, non_indexable at the front"""
        return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes],
                                     [int(a.axis) for a in self.index_axes]))

    def queryables(self):
        """ return a dict of the kinds allowable columns for this object """

        # compute the values_axes queryables
        return dict(
            [(a.cname, a) for a in self.index_axes] +
            [(self.storage_obj_type._AXIS_NAMES[axis], None)
             for axis, values in self.non_index_axes] +
            [(v.cname, v) for v in self.values_axes
             if v.name in set(self.data_columns)]
        )
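
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): after
# appending with data_columns, the table storer returned by the public
# HDFStore.get_storer exposes the queryable columns via the method above.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'A': range(3), 'B': list('abc')})
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df, data_columns=['B'])
#   ...     q = store.get_storer('df').queryables()  # includes 'index', 'B'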

    def index_cols(self):
        """ return a list of my index cols """
        return [(i.axis, i.cname) for i in self.index_axes]

    def values_cols(self):
        """ return a list of my values cols """
        return [i.cname for i in self.values_axes]

    def _get_metadata_path(self, key):
        """ return the metadata pathname for this key """
        return "{group}/meta/{key}/meta".format(group=self.group._v_pathname,
                                                key=key)

    def write_metadata(self, key, values):
        """
        write out a metadata array to the key as a table-format Series

        Parameters
        ----------
        key : string
        values : ndarray
        """
        values = Series(values)
        self.parent.put(self._get_metadata_path(key), values, format='table',
                        encoding=self.encoding, errors=self.errors,
                        nan_rep=self.nan_rep)

    def read_metadata(self, key):
        """ return the metadata array for this key """
        if getattr(getattr(self.group, 'meta', None), key, None) is not None:
            return self.parent.select(self._get_metadata_path(key))
        return None

    def set_info(self):
        """ update our table index info """
        self.attrs.info = self.info

    def set_attrs(self):
        """ set our table type & indexables """
        self.attrs.table_type = str(self.table_type)
        self.attrs.index_cols = self.index_cols()
        self.attrs.values_cols = self.values_cols()
        self.attrs.non_index_axes = self.non_index_axes
        self.attrs.data_columns = self.data_columns
        self.attrs.nan_rep = self.nan_rep
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors
        self.attrs.levels = self.levels
        self.attrs.metadata = self.metadata
        self.set_info()

    def get_attrs(self):
        """ retrieve our attributes """
        self.non_index_axes = getattr(
            self.attrs, 'non_index_axes', None) or []
        self.data_columns = getattr(
            self.attrs, 'data_columns', None) or []
        self.info = getattr(
            self.attrs, 'info', None) or dict()
        self.nan_rep = getattr(self.attrs, 'nan_rep', None)
        self.encoding = _ensure_encoding(
            getattr(self.attrs, 'encoding', None))
        self.errors = _ensure_decoded(getattr(self.attrs, 'errors', 'strict'))
        self.levels = getattr(
            self.attrs, 'levels', None) or []
        self.index_axes = [
            a.infer(self) for a in self.indexables if a.is_an_indexable
        ]
        self.values_axes = [
            a.infer(self) for a in self.indexables if not a.is_an_indexable
        ]
        self.metadata = getattr(
            self.attrs, 'metadata', None) or []

    def validate_version(self, where=None):
        """ are we trying to operate on an old version? """
        if where is not None:
            if (self.version[0] <= 0 and self.version[1] <= 10 and
                    self.version[2] < 1):
                ws = incompatibility_doc % '.'.join(
                    [str(x) for x in self.version])
                warnings.warn(ws, IncompatibilityWarning)

    def validate_min_itemsize(self, min_itemsize):
        """validate that min_itemsize doesn't contain items that are not in
        the axes; this needs data_columns to be defined
        """
        if min_itemsize is None:
            return
        if not isinstance(min_itemsize, dict):
            return

        q = self.queryables()
        for k, v in min_itemsize.items():

            # ok, apply generally
            if k == 'values':
                continue
            if k not in q:
                raise ValueError(
                    "min_itemsize has the key [{key}] which is not an axis or "
                    "data_column".format(key=k))

    @property
    def indexables(self):
        """ create/cache the indexables if they don't exist """
        if self._indexables is None:

            self._indexables = []

            # index columns
            self._indexables.extend([
                IndexCol(name=name, axis=axis, pos=i)
                for i, (axis, name) in enumerate(self.attrs.index_cols)
            ])

            # values columns
            dc = set(self.data_columns)
            base_pos = len(self._indexables)

            def f(i, c):
                klass = DataCol
                if c in dc:
                    klass = DataIndexableCol
                return klass.create_for_block(i=i, name=c, pos=base_pos + i,
                                              version=self.version)

            self._indexables.extend(
                [f(i, c) for i, c in enumerate(self.attrs.values_cols)])

        return self._indexables

    def create_index(self, columns=None, optlevel=None, kind=None):
        """
        Create a pytables index on the specified columns
          note: cannot index Time64Col() or ComplexCol currently;
          PyTables must be >= 3.0

        Parameters
        ----------
        columns : False (don't create an index), True (create all columns
            index), None or list_like (the indexers to index)
        optlevel: optimization level (defaults to 6)
        kind    : kind of index (defaults to 'medium')

        Exceptions
        ----------
        raises if the node is not a table
        """

        if not self.infer_axes():
            return
        if columns is False:
            return

        # index all indexables and data_columns
        if columns is None or columns is True:
            columns = [a.cname for a in self.axes if a.is_data_indexable]
        if not isinstance(columns, (tuple, list)):
            columns = [columns]

        kw = dict()
        if optlevel is not None:
            kw['optlevel'] = optlevel
        if kind is not None:
            kw['kind'] = kind

        table = self.table
        for c in columns:
            v = getattr(table.cols, c, None)
            if v is not None:

                # remove the index if the kind/optlevel have changed
                if v.is_indexed:
                    index = v.index
                    cur_optlevel = index.optlevel
                    cur_kind = index.kind

                    if kind is not None and cur_kind != kind:
                        v.remove_index()
                    else:
                        kw['kind'] = cur_kind

                    if optlevel is not None and cur_optlevel != optlevel:
                        v.remove_index()
                    else:
                        kw['optlevel'] = cur_optlevel

                # create the index
                if not v.is_indexed:
                    if v.type.startswith('complex'):
                        raise TypeError(
                            'Columns containing complex values can be stored '
                            'but cannot'
                            ' be indexed when using table format. Either use '
                            'fixed format, set index=False, or do not include '
                            'the columns containing complex values in '
                            'data_columns when initializing the table.')
                    v.create_index(**kw)
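
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): the
# public HDFStore.create_table_index routes to create_index above, and only
# applies to table-format nodes and data-indexable columns.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'A': range(10)})
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df, data_columns=['A'], index=False)
#   ...     store.create_table_index('df', columns=['A'], optlevel=9,
#   ...                              kind='full')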

    def read_axes(self, where, **kwargs):
        """create and return the axes sniffed from the table: return boolean
        for success
        """

        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return False

        # create the selection
        self.selection = Selection(self, where=where, **kwargs)
        values = self.selection.select()

        # convert the data
        for a in self.axes:
            a.set_info(self.info)
            a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
                      errors=self.errors)

        return True

    def get_object(self, obj):
        """ return the data for this obj """
        return obj

    def validate_data_columns(self, data_columns, min_itemsize):
        """take the input data_columns and min_itemsize and create a data
        columns spec
        """

        if not len(self.non_index_axes):
            return []

        axis, axis_labels = self.non_index_axes[0]
        info = self.info.get(axis, dict())
        if info.get('type') == 'MultiIndex' and data_columns:
            raise ValueError("cannot use a multi-index on axis [{0}] with "
                             "data_columns {1}".format(axis, data_columns))

        # evaluate the passed data_columns, True == use all columns
        # take only valid axis labels
        if data_columns is True:
            data_columns = list(axis_labels)
        elif data_columns is None:
            data_columns = []

        # if min_itemsize is a dict, add the keys (exclude 'values')
        if isinstance(min_itemsize, dict):
            existing_data_columns = set(data_columns)
            data_columns.extend([
                k for k in min_itemsize.keys()
                if k != 'values' and k not in existing_data_columns
            ])

        # return valid columns in the order of our axis
        return [c for c in data_columns if c in axis_labels]
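
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up):
# data_columns controls which columns are stored as individually queryable
# columns (True means all of them), enabling where-based selection.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'A': range(5), 'B': list('aabbc')})
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df, data_columns=['B'])
#   ...     subset = store.select('df', where="B == 'a'")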

    def create_axes(self, axes, obj, validate=True, nan_rep=None,
                    data_columns=None, min_itemsize=None, **kwargs):
        """ create and return the axes
        legacy tables create an indexable column, indexable index,
        non-indexable fields

        Parameters
        ----------
        axes: a list of the axes in order to create (names or numbers of
            the axes)
        obj : the object to create axes on
        validate: validate the obj against an existing object already
            written
        min_itemsize: a dict of the min size for a column in bytes
        nan_rep : a value to use for string column nan_rep
        encoding : the encoding for string values
        data_columns : a list of columns that we want to create separate to
            allow indexing (or True will force all columns)

        """

        # set the default axes if needed
        if axes is None:
            try:
                axes = _AXES_MAP[type(obj)]
            except KeyError:
                raise TypeError(
                    "cannot properly create the storer for: [group->{group},"
                    "value->{value}]".format(
                        group=self.group._v_name, value=type(obj)))

        # map axes to numbers
        axes = [obj._get_axis_number(a) for a in axes]

        # do we have an existing table (if so, use its axes & data_columns)
        if self.infer_axes():
            existing_table = self.copy()
            existing_table.infer_axes()
            axes = [a.axis for a in existing_table.index_axes]
            data_columns = existing_table.data_columns
            nan_rep = existing_table.nan_rep
            self.encoding = existing_table.encoding
            self.errors = existing_table.errors
            self.info = copy.copy(existing_table.info)
        else:
            existing_table = None

        # currently supported on ndim-1 axes only
        if len(axes) != self.ndim - 1:
            raise ValueError(
                "currently only support ndim-1 indexers in an AppendableTable")

        # create according to the new data
        self.non_index_axes = []
        self.data_columns = []

        # nan_representation
        if nan_rep is None:
            nan_rep = 'nan'

        self.nan_rep = nan_rep

        # create axes to index and non_index
        index_axes_map = dict()
        for i, a in enumerate(obj.axes):

            if i in axes:
                name = obj._AXIS_NAMES[i]
                index_axes_map[i] = _convert_index(
                    a, self.encoding, self.errors, self.format_type
                ).set_name(name).set_axis(i)
            else:

                # we might be able to change the axes on the appending data if
                # necessary
                append_axis = list(a)
                if existing_table is not None:
                    indexer = len(self.non_index_axes)
                    exist_axis = existing_table.non_index_axes[indexer][1]
                    if not array_equivalent(np.array(append_axis),
                                            np.array(exist_axis)):

                        # ahah! -> reindex
                        if array_equivalent(np.array(sorted(append_axis)),
                                            np.array(sorted(exist_axis))):
                            append_axis = exist_axis

                # the non_index_axes info
                info = _get_info(self.info, i)
                info['names'] = list(a.names)
                info['type'] = a.__class__.__name__

                self.non_index_axes.append((i, append_axis))

        # set axis positions (based on the axes)
        self.index_axes = [
            index_axes_map[a].set_pos(j).update_info(self.info)
            for j, a in enumerate(axes)
        ]
        j = len(self.index_axes)

        # check for column conflicts
        for a in self.axes:
            a.maybe_set_size(min_itemsize=min_itemsize)

        # reindex by our non_index_axes & compute data_columns
        for a in self.non_index_axes:
            obj = _reindex_axis(obj, a[0], a[1])

        def get_blk_items(mgr, blocks):
            return [mgr.items.take(blk.mgr_locs) for blk in blocks]

        # figure out data_columns and get out blocks
        block_obj = self.get_object(obj)._consolidate()
        blocks = block_obj._data.blocks
        blk_items = get_blk_items(block_obj._data, blocks)
        if len(self.non_index_axes):
            axis, axis_labels = self.non_index_axes[0]
            data_columns = self.validate_data_columns(
                data_columns, min_itemsize)
            if len(data_columns):
                mgr = block_obj.reindex(
                    Index(axis_labels).difference(Index(data_columns)),
                    axis=axis
                )._data

                blocks = list(mgr.blocks)
                blk_items = get_blk_items(mgr, blocks)
                for c in data_columns:
                    mgr = block_obj.reindex([c], axis=axis)._data
                    blocks.extend(mgr.blocks)
                    blk_items.extend(get_blk_items(mgr, mgr.blocks))

        # reorder the blocks in the same order as the existing_table if we can
        if existing_table is not None:
            by_items = {tuple(b_items.tolist()): (b, b_items)
                        for b, b_items in zip(blocks, blk_items)}
            new_blocks = []
            new_blk_items = []
            for ea in existing_table.values_axes:
                items = tuple(ea.values)
                try:
                    b, b_items = by_items.pop(items)
                    new_blocks.append(b)
                    new_blk_items.append(b_items)
                except (IndexError, KeyError):
                    raise ValueError(
                        "cannot match existing table structure for [{items}] "
                        "on appending data".format(
                            items=(','.join(pprint_thing(item) for
                                            item in items))))
            blocks = new_blocks
            blk_items = new_blk_items

        # add my values
        self.values_axes = []
        for i, (b, b_items) in enumerate(zip(blocks, blk_items)):

            # shape of the data column are the indexable axes
            klass = DataCol
            name = None

            # we have a data_column
            if (data_columns and len(b_items) == 1 and
                    b_items[0] in data_columns):
                klass = DataIndexableCol
                name = b_items[0]
                self.data_columns.append(name)

            # make sure that we match up the existing columns
            # if we have an existing table
            if existing_table is not None and validate:
                try:
                    existing_col = existing_table.values_axes[i]
                except (IndexError, KeyError):
                    raise ValueError(
                        "Incompatible appended table [{blocks}]"
                        "with existing table [{table}]".format(
                            blocks=blocks,
                            table=existing_table.values_axes))
            else:
                existing_col = None

            try:
                col = klass.create_for_block(
                    i=i, name=name, version=self.version)
                col.set_atom(block=b, block_items=b_items,
                             existing_col=existing_col,
                             min_itemsize=min_itemsize,
                             nan_rep=nan_rep,
                             encoding=self.encoding,
                             errors=self.errors,
                             info=self.info)
                col.set_pos(j)

                self.values_axes.append(col)
            except (NotImplementedError, ValueError, TypeError) as e:
                raise e
            except Exception as detail:
                raise Exception(
                    "cannot find the correct atom type -> "
                    "[dtype->{name},items->{items}] {detail!s}".format(
                        name=b.dtype.name, items=b_items, detail=detail))
            j += 1

        # validate our min_itemsize
        self.validate_min_itemsize(min_itemsize)

        # validate our metadata
        self.validate_metadata(existing_table)

        # validate the axes if we have an existing table
        if validate:
            self.validate(existing_table)

    def process_axes(self, obj, columns=None):
        """ process axes filters """

        # make a copy to avoid side effects
        if columns is not None:
            columns = list(columns)

        # make sure to include levels if we have them
        if columns is not None and self.is_multi_index:
            for n in self.levels:
                if n not in columns:
                    columns.insert(0, n)

        # reorder by any non_index_axes & limit to the select columns
        for axis, labels in self.non_index_axes:
            obj = _reindex_axis(obj, axis, labels, columns)

        # apply the selection filters (but keep in the same order)
        if self.selection.filter is not None:
            for field, op, filt in self.selection.filter.format():

                def process_filter(field, filt):

                    for axis_name in obj._AXIS_NAMES.values():
                        axis_number = obj._get_axis_number(axis_name)
                        axis_values = obj._get_axis(axis_name)

                        # see if the field is the name of an axis
                        if field == axis_name:

                            # if we have a multi-index, then need to include
                            # the levels
                            if self.is_multi_index:
                                filt = filt.union(Index(self.levels))

                            takers = op(axis_values, filt)
                            return obj.loc._getitem_axis(takers,
                                                         axis=axis_number)

                        # this might be the name of a field IN an axis
                        elif field in axis_values:

                            # we need to filter on this dimension
                            values = ensure_index(getattr(obj, field).values)
                            filt = ensure_index(filt)

                            # hack until we support reversed dim flags
                            if isinstance(obj, DataFrame):
                                axis_number = 1 - axis_number
                            takers = op(values, filt)
                            return obj.loc._getitem_axis(takers,
                                                         axis=axis_number)

                    raise ValueError("cannot find the field [{field}] for "
                                     "filtering!".format(field=field))

                obj = process_filter(field, filt)

        return obj

    def create_description(self, complib=None, complevel=None,
                           fletcher32=False, expectedrows=None):
        """ create the description of the table from the axes & values """

        # provide expectedrows if it's passed
        if expectedrows is None:
            expectedrows = max(self.nrows_expected, 10000)

        d = dict(name='table', expectedrows=expectedrows)

        # description from the axes & values
        d['description'] = {a.cname: a.typ for a in self.axes}

        if complib:
            if complevel is None:
                complevel = self._complevel or 9
            filters = _tables().Filters(
                complevel=complevel, complib=complib,
                fletcher32=fletcher32 or self._fletcher32)
            d['filters'] = filters
        elif self._filters is not None:
            d['filters'] = self._filters

        return d

    def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
        """select coordinates (row numbers) from a table; return the
        coordinates object
        """

        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return False

        # create the selection
        self.selection = Selection(
            self, where=where, start=start, stop=stop, **kwargs)
        coords = self.selection.select_coords()
        if self.selection.filter is not None:
            for field, op, filt in self.selection.filter.format():
                data = self.read_column(
                    field, start=coords.min(), stop=coords.max() + 1)
                coords = coords[
                    op(data.iloc[coords - coords.min()], filt).values]

        return Index(coords)

    def read_column(self, column, where=None, start=None, stop=None):
        """return a single column from the table; generally only indexables
        are interesting
        """

        # validate the version
        self.validate_version()

        # infer the data kind
        if not self.infer_axes():
            return False

        if where is not None:
            raise TypeError("read_column does not currently accept a where "
                            "clause")

        # find the axes
        for a in self.axes:
            if column == a.name:

                if not a.is_data_indexable:
                    raise ValueError(
                        "column [{column}] cannot be extracted individually; "
                        "it is not data indexable".format(column=column))

                # column must be an indexable or a data column
                c = getattr(self.table.cols, column)
                a.set_info(self.info)
                return Series(_set_tz(a.convert(c[start:stop],
                                                nan_rep=self.nan_rep,
                                                encoding=self.encoding,
                                                errors=self.errors
                                                ).take_data(),
                                      a.tz, True), name=column)

        raise KeyError(
            "column [{column}] not found in the table".format(column=column))

class WORMTable(Table):

    """ a write-once read-many table: this format DOES NOT ALLOW appending to
        a table. writing is a one-time operation; the data are stored in a
        format that allows for searching the data on disk
    """
    table_type = u'worm'

    def read(self, **kwargs):
        """ read the indices and the indexing array, calculate offset rows and
        return """
        raise NotImplementedError("WORMTable needs to implement read")

    def write(self, **kwargs):
        """ write in a format that we can search later on (but cannot append
        to): write out the indices and the values using _write_array
        (e.g. a CArray) create an indexing table so that we can search
        """
        raise NotImplementedError("WORMTable needs to implement write")

class LegacyTable(Table):

    """ an appendable table: allow append/query/delete operations to a
        (possibly) already existing appendable table; this table ALLOWS
        append (but doesn't require it), and stores the data in a format
        that can be easily searched
    """
    _indexables = [
        IndexCol(name='index', axis=1, pos=0),
        IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'),
        DataCol(name='fields', cname='values', kind_attr='fields', pos=2)
    ]
    table_type = u'legacy'
    ndim = 3

    def write(self, **kwargs):
        raise TypeError("write operations are not allowed on legacy tables!")

    def read(self, where=None, columns=None, **kwargs):
        """we have n indexable columns, with an arbitrary number of data
        axes
        """

        if not self.read_axes(where=where, **kwargs):
            return None

        lst_vals = [a.values for a in self.index_axes]
        labels, levels = _factorize_from_iterables(lst_vals)
        # labels and levels are tuples but lists are expected
        labels = list(labels)
        levels = list(levels)
        N = [len(lvl) for lvl in levels]

        # compute the key
        key = _factor_indexer(N[1:], labels)

        objs = []
        if len(unique(key)) == len(key):

            sorter, _ = algos.groupsort_indexer(
                ensure_int64(key), np.prod(N))
            sorter = ensure_platform_int(sorter)

            # create the objs
            for c in self.values_axes:

                # the data need to be sorted
                sorted_values = c.take_data().take(sorter, axis=0)
                if sorted_values.ndim == 1:
                    sorted_values = sorted_values.reshape(
                        (sorted_values.shape[0], 1))

                take_labels = [l.take(sorter) for l in labels]
                items = Index(c.values)
                block = _block2d_to_blocknd(
                    values=sorted_values, placement=np.arange(len(items)),
                    shape=tuple(N), labels=take_labels, ref_items=items)

                # create the object
                mgr = BlockManager([block], [items] + levels)
                obj = self.obj_type(mgr)

                # permute if needed
                if self.is_transposed:
                    obj = obj.transpose(
                        *tuple(Series(self.data_orientation).argsort()))

                objs.append(obj)

        else:
            warnings.warn(duplicate_doc, DuplicateWarning, stacklevel=5)

            # reconstruct
            long_index = MultiIndex.from_arrays(
                [i.values for i in self.index_axes])

            for c in self.values_axes:
                lp = DataFrame(c.data, index=long_index, columns=c.values)

                # need a better algorithm
                tuple_index = long_index.values

                unique_tuples = unique(tuple_index)
                unique_tuples = com.asarray_tuplesafe(unique_tuples)

                indexer = match(unique_tuples, tuple_index)
                indexer = ensure_platform_int(indexer)

                new_index = long_index.take(indexer)
                new_values = lp.values.take(indexer, axis=0)

                lp = DataFrame(new_values, index=new_index,
                               columns=lp.columns)
                objs.append(lp.to_panel())

        # create the composite object
        if len(objs) == 1:
            wp = objs[0]
        else:
            wp = concat(objs, axis=0, verify_integrity=False)._consolidate()

        # apply the selection filters & axis orderings
        wp = self.process_axes(wp, columns=columns)

        return wp

class LegacyFrameTable(LegacyTable):

    """ support the legacy frame table """
    pandas_kind = u'frame_table'
    table_type = u'legacy_frame'
    obj_type = Panel

    def read(self, *args, **kwargs):
        return super(LegacyFrameTable, self).read(*args, **kwargs)['value']


class LegacyPanelTable(LegacyTable):

    """ support the legacy panel table """
    table_type = u'legacy_panel'
    obj_type = Panel

class AppendableTable(LegacyTable):

    """ support the new appendable table formats """
    _indexables = None
    table_type = u'appendable'

    def write(self, obj, axes=None, append=False, complib=None,
              complevel=None, fletcher32=None, min_itemsize=None,
              chunksize=None, expectedrows=None, dropna=False, **kwargs):

        if not append and self.is_exists:
            self._handle.remove_node(self.group, 'table')

        # create the axes
        self.create_axes(axes=axes, obj=obj, validate=append,
                         min_itemsize=min_itemsize,
                         **kwargs)

        for a in self.axes:
            a.validate(self, append)

        if not self.is_exists:

            # create the table
            options = self.create_description(complib=complib,
                                              complevel=complevel,
                                              fletcher32=fletcher32,
                                              expectedrows=expectedrows)

            # set the table attributes
            self.set_attrs()

            # create the table
            self._handle.create_table(self.group, **options)
        else:
            pass
            # table = self.table

        # update my info
        self.set_info()

        # validate the axes and set the kinds
        for a in self.axes:
            a.validate_and_set(self, append)

        # add the rows
        self.write_data(chunksize, dropna=dropna)
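
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): rows are
# written in chunks (100000 rows by default, see write_data below); chunksize
# bounds memory use and expectedrows helps PyTables size the table.
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> df = pd.DataFrame(np.random.randn(500000, 3), columns=list('abc'))
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df, chunksize=50000, expectedrows=500000)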

    def write_data(self, chunksize, dropna=False):
        """ we form the data into a 2-d including indexes,values,mask
            write chunk-by-chunk """

        names = self.dtype.names
        nrows = self.nrows_expected

        # if dropna==True, then drop ALL nan rows
        masks = []
        if dropna:

            for a in self.values_axes:

                # figure the mask: only do if we can successfully process this
                # column, otherwise ignore the mask
                mask = isna(a.data).all(axis=0)
                if isinstance(mask, np.ndarray):
                    masks.append(mask.astype('u1', copy=False))

        # consolidate masks
        if len(masks):
            mask = masks[0]
            for m in masks[1:]:
                mask = mask & m
            mask = mask.ravel()
        else:
            mask = None

        # broadcast the indexes if needed
        indexes = [a.cvalues for a in self.index_axes]
        nindexes = len(indexes)
        bindexes = []
        for i, idx in enumerate(indexes):

            # broadcast to all other indexes except myself
            if i > 0 and i < nindexes:
                repeater = np.prod(
                    [indexes[bi].shape[0] for bi in range(0, i)])
                idx = np.tile(idx, repeater)

            if i < nindexes - 1:
                repeater = np.prod([indexes[bi].shape[0]
                                    for bi in range(i + 1, nindexes)])
                idx = np.repeat(idx, repeater)

            bindexes.append(idx)

        # transpose the values so first dimension is last
        # reshape the values if needed
        values = [a.take_data() for a in self.values_axes]
        values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))
                  for v in values]
        bvalues = []
        for i, v in enumerate(values):
            new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
            bvalues.append(values[i].reshape(new_shape))

        # write the chunks
        if chunksize is None:
            chunksize = 100000

        rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
        chunks = int(nrows / chunksize) + 1
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break

            self.write_data_chunk(
                rows,
                indexes=[a[start_i:end_i] for a in bindexes],
                mask=mask[start_i:end_i] if mask is not None else None,
                values=[v[start_i:end_i] for v in bvalues])

    def write_data_chunk(self, rows, indexes, mask, values):
        """
        Parameters
        ----------
        rows : an empty memory space where we are putting the chunk
        indexes : an array of the indexes
        mask : an array of the masks
        values : an array of the values
        """

        # 0 len
        for v in values:
            if not np.prod(v.shape):
                return

        try:
            nrows = indexes[0].shape[0]
            if nrows != len(rows):
                rows = np.empty(nrows, dtype=self.dtype)
            names = self.dtype.names
            nindexes = len(indexes)

            # indexes
            for i, idx in enumerate(indexes):
                rows[names[i]] = idx

            # values
            for i, v in enumerate(values):
                rows[names[i + nindexes]] = v

            # mask
            if mask is not None:
                m = ~mask.ravel().astype(bool, copy=False)
                if not m.all():
                    rows = rows[m]

        except Exception as detail:
            raise Exception(
                "cannot create row-data -> {detail}".format(detail=detail))

        try:
            if len(rows):
                self.table.append(rows)
                self.table.flush()
        except Exception as detail:
            raise TypeError(
                "tables cannot write this data -> {detail}".format(
                    detail=detail))

    def delete(self, where=None, start=None, stop=None, **kwargs):

        # delete all rows (and return the nrows)
        if where is None or not len(where):
            if start is None and stop is None:
                nrows = self.nrows
                self._handle.remove_node(self.group, recursive=True)
            else:
                # pytables<3.0 would remove a single row with stop=None
                if stop is None:
                    stop = self.nrows
                nrows = self.table.remove_rows(start=start, stop=stop)
                self.table.flush()
            return nrows

        # infer the data kind
        if not self.infer_axes():
            return None

        # create the selection
        table = self.table
        self.selection = Selection(
            self, where, start=start, stop=stop, **kwargs)
        values = self.selection.select_coords()

        # delete the rows in reverse order
        sorted_series = Series(values).sort_values()
        ln = len(sorted_series)

        if ln:

            # construct groups of consecutive rows
            diff = sorted_series.diff()
            groups = list(diff[diff > 1].index)

            # 1 group
            if not len(groups):
                groups = [0]

            # final element
            if groups[-1] != ln:
                groups.append(ln)

            # initial element
            if groups[0] != 0:
                groups.insert(0, 0)

            # we must remove in reverse order!
            pg = groups.pop()
            for g in reversed(groups):
                rows = sorted_series.take(lrange(g, pg))
                table.remove_rows(start=rows[rows.index[0]],
                                  stop=rows[rows.index[-1]] + 1)
                pg = g

            self.table.flush()

        # return the number of rows removed
        return ln
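
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): the
# public HDFStore.remove with a where clause lands here; rows are deleted in
# consecutive groups, in reverse order, and the row count is returned.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'A': range(10)})
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df, data_columns=['A'])
#   ...     n = store.remove('df', where='A > 5')  # number of rows removed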

class AppendableFrameTable(AppendableTable):

    """ support the new appendable table formats """
    pandas_kind = u'frame_table'
    table_type = u'appendable_frame'
    ndim = 2
    obj_type = DataFrame

    @property
    def is_transposed(self):
        return self.index_axes[0].axis == 1

    def get_object(self, obj):
        """ these are written transposed """
        if self.is_transposed:
            obj = obj.T
        return obj

    def read(self, where=None, columns=None, **kwargs):

        if not self.read_axes(where=where, **kwargs):
            return None

        info = (self.info.get(self.non_index_axes[0][0], dict())
                if len(self.non_index_axes) else dict())
        index = self.index_axes[0].values
        frames = []
        for a in self.values_axes:

            # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
            if info.get('type') == 'MultiIndex':
                cols = MultiIndex.from_tuples(a.values)
            else:
                cols = Index(a.values)
            names = info.get('names')
            if names is not None:
                cols.set_names(names, inplace=True)

            if self.is_transposed:
                values = a.cvalues
                index_ = cols
                cols_ = Index(index, name=getattr(index, 'name', None))
            else:
                values = a.cvalues.T
                index_ = Index(index, name=getattr(index, 'name', None))
                cols_ = cols

            # if we have a DataIndexableCol, its shape will only be 1 dim
            if values.ndim == 1 and isinstance(values, np.ndarray):
                values = values.reshape((1, values.shape[0]))

            block = make_block(values, placement=np.arange(len(cols_)))
            mgr = BlockManager([block], [cols_, index_])
            frames.append(DataFrame(mgr))

        if len(frames) == 1:
            df = frames[0]
        else:
            df = concat(frames, axis=1)

        # apply the selection filters & axis orderings
        df = self.process_axes(df, columns=columns)

        return df

class AppendableSeriesTable(AppendableFrameTable):

    """ support the new appendable table formats """
    pandas_kind = u'series_table'
    table_type = u'appendable_series'
    ndim = 2
    obj_type = Series
    storage_obj_type = DataFrame

    @property
    def is_transposed(self):
        return False

    def get_object(self, obj):
        return obj

    def write(self, obj, data_columns=None, **kwargs):
        """ we are going to write this as a frame table """
        if not isinstance(obj, DataFrame):
            name = obj.name or 'values'
            obj = DataFrame({name: obj}, index=obj.index)
            obj.columns = [name]
        return super(AppendableSeriesTable, self).write(
            obj=obj, data_columns=obj.columns.tolist(), **kwargs)

    def read(self, columns=None, **kwargs):

        is_multi_index = self.is_multi_index
        if columns is not None and is_multi_index:
            for n in self.levels:
                if n not in columns:
                    columns.insert(0, n)
        s = super(AppendableSeriesTable, self).read(columns=columns, **kwargs)
        if is_multi_index:
            s.set_index(self.levels, inplace=True)

        s = s.iloc[:, 0]

        # remove the default name
        if s.name == 'values':
            s.name = None
        return s
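
# Usage sketch (illustrative only; 'demo.h5' and 's' are made up): a Series
# stored with format='table' is written as a one-column frame (with the
# placeholder column name 'values' if unnamed) and unwrapped again on read.
#
#   >>> import pandas as pd
#   >>> s = pd.Series([1, 2, 3])
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.put('s', s, format='table')
#   ...     assert store['s'].equals(s)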

class AppendableMultiSeriesTable(AppendableSeriesTable):

    """ support the new appendable table formats """
    pandas_kind = u'series_table'
    table_type = u'appendable_multiseries'

    def write(self, obj, **kwargs):
        """ we are going to write this as a frame table """
        name = obj.name or 'values'
        obj, self.levels = self.validate_multiindex(obj)
        cols = list(self.levels)
        cols.append(name)
        obj.columns = cols
        return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)

class GenericTable(AppendableFrameTable):

    """ a table that reads/writes the generic pytables table format """
    pandas_kind = u'frame_table'
    table_type = u'generic_table'
    ndim = 2
    obj_type = DataFrame

    @property
    def pandas_type(self):
        return self.pandas_kind

    @property
    def storable(self):
        return getattr(self.group, 'table', None) or self.group

    def get_attrs(self):
        """ retrieve our attributes """
        self.non_index_axes = []
        self.nan_rep = None
        self.levels = []

        self.index_axes = [a.infer(self)
                           for a in self.indexables if a.is_an_indexable]
        self.values_axes = [a.infer(self)
                            for a in self.indexables if not a.is_an_indexable]
        self.data_columns = [a.name for a in self.values_axes]

    @property
    def indexables(self):
        """ create the indexables from the table description """
        if self._indexables is None:
            d = self.description

            # the index column is just a simple index
            self._indexables = [GenericIndexCol(name='index', axis=0)]

            for i, n in enumerate(d._v_names):
                dc = GenericDataIndexableCol(
                    name=n, pos=i, values=[n], version=self.version)
                self._indexables.append(dc)

        return self._indexables

    def write(self, **kwargs):
        raise NotImplementedError("cannot write on a generic table")

class AppendableMultiFrameTable(AppendableFrameTable):

    """ a frame with a multi-index """
    table_type = u'appendable_multiframe'
    obj_type = DataFrame
    ndim = 2
    _re_levels = re.compile(r"^level_\d+$")

    @property
    def table_type_short(self):
        return u'appendable_multi'

    def write(self, obj, data_columns=None, **kwargs):
        if data_columns is None:
            data_columns = []
        elif data_columns is True:
            data_columns = obj.columns.tolist()
        obj, self.levels = self.validate_multiindex(obj)
        for n in self.levels:
            if n not in data_columns:
                data_columns.insert(0, n)
        return super(AppendableMultiFrameTable, self).write(
            obj=obj, data_columns=data_columns, **kwargs)

    def read(self, **kwargs):

        df = super(AppendableMultiFrameTable, self).read(**kwargs)
        df = df.set_index(self.levels)

        # remove names for 'level_%d'
        df.index = df.index.set_names([
            None if self._re_levels.search(l) else l for l in df.index.names
        ])

        return df
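
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): a
# MultiIndex frame in table format is stored with its index levels reset to
# columns (and forced into data_columns) and restored on read.
#
#   >>> import pandas as pd
#   >>> idx = pd.MultiIndex.from_product([['a', 'b'], [1, 2]],
#   ...                                  names=['k1', 'k2'])
#   >>> df = pd.DataFrame({'v': range(4)}, index=idx)
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df)
#   ...     assert list(store['df'].index.names) == ['k1', 'k2']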

class AppendablePanelTable(AppendableTable):

    """ support the new appendable table formats """
    table_type = u'appendable_panel'
    ndim = 3
    obj_type = Panel

    def get_object(self, obj):
        """ these are written transposed """
        if self.is_transposed:
            obj = obj.transpose(*self.data_orientation)
        return obj

    @property
    def is_transposed(self):
        return self.data_orientation != tuple(range(self.ndim))

def _reindex_axis(obj, axis, labels, other=None):
    ax = obj._get_axis(axis)
    labels = ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = ensure_index(labels.unique())
    if other is not None:
        labels = ensure_index(other.unique()).intersection(labels, sort=False)
    if not labels.equals(ax):
        slicer = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj

def _get_info(info, name):
    """ get/create the info for this name """
    try:
        idx = info[name]
    except KeyError:
        idx = info[name] = dict()
    return idx


# tz to/from coercion


def _get_tz(tz):
    """ for a tz-aware type, return an encoded zone """
    zone = timezones.get_timezone(tz)
    if zone is None:
        zone = tz.utcoffset().total_seconds()
    return zone

def _set_tz(values, tz, preserve_UTC=False, coerce=False):
    """
    coerce the values to a DatetimeIndex if tz is set
    preserve the input shape if possible

    Parameters
    ----------
    values : ndarray
    tz : string/pickled tz object
    preserve_UTC : boolean,
        preserve the UTC of the result
    coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
    """
    if tz is not None:
        name = getattr(values, 'name', None)
        values = values.ravel()
        tz = timezones.get_timezone(_ensure_decoded(tz))
        values = DatetimeIndex(values, name=name)
        if values.tz is None:
            values = values.tz_localize('UTC').tz_convert(tz)
        if preserve_UTC:
            if tz == 'UTC':
                values = list(values)
    elif coerce:
        values = np.asarray(values, dtype='M8[ns]')

    return values

def _convert_index(index, encoding=None, errors='strict', format_type=None):
    index_name = getattr(index, 'name', None)

    if isinstance(index, DatetimeIndex):
        converted = index.asi8
        return IndexCol(converted, 'datetime64', _tables().Int64Col(),
                        freq=getattr(index, 'freq', None),
                        tz=getattr(index, 'tz', None),
                        index_name=index_name)
    elif isinstance(index, TimedeltaIndex):
        converted = index.asi8
        return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
                        freq=getattr(index, 'freq', None),
                        index_name=index_name)
    elif isinstance(index, (Int64Index, PeriodIndex)):
        atom = _tables().Int64Col()
        # avoid storing an ndarray of Period objects
        return IndexCol(index._ndarray_values, 'integer', atom,
                        freq=getattr(index, 'freq', None),
                        index_name=index_name)

    if isinstance(index, MultiIndex):
        raise TypeError('MultiIndex not supported here!')

    inferred_type = lib.infer_dtype(index, skipna=False)

    values = np.asarray(index)

    if inferred_type == 'datetime64':
        converted = values.view('i8')
        return IndexCol(converted, 'datetime64', _tables().Int64Col(),
                        freq=getattr(index, 'freq', None),
                        tz=getattr(index, 'tz', None),
                        index_name=index_name)
    elif inferred_type == 'timedelta64':
        converted = values.view('i8')
        return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
                        freq=getattr(index, 'freq', None),
                        index_name=index_name)
    elif inferred_type == 'datetime':
        converted = np.asarray([(time.mktime(v.timetuple()) +
                                 v.microsecond / 1E6) for v in values],
                               dtype=np.float64)
        return IndexCol(converted, 'datetime', _tables().Time64Col(),
                        index_name=index_name)
    elif inferred_type == 'date':
        converted = np.asarray([v.toordinal() for v in values],
                               dtype=np.int32)
        return IndexCol(converted, 'date', _tables().Time32Col(),
                        index_name=index_name)
    elif inferred_type == 'string':
        # atom = _tables().ObjectAtom()
        # return np.asarray(values, dtype='O'), 'object', atom

        converted = _convert_string_array(values, encoding, errors)
        itemsize = converted.dtype.itemsize
        return IndexCol(
            converted, 'string', _tables().StringCol(itemsize),
            itemsize=itemsize, index_name=index_name
        )
    elif inferred_type == 'unicode':
        if format_type == 'fixed':
            atom = _tables().ObjectAtom()
            return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
                            index_name=index_name)
        raise TypeError(
            "[unicode] is not supported as an index type for [{0}] formats"
            .format(format_type)
        )

    elif inferred_type == 'integer':
        # take a guess for now, hope the values fit
        atom = _tables().Int64Col()
        return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom,
                        index_name=index_name)
    elif inferred_type == 'floating':
        atom = _tables().Float64Col()
        return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom,
                        index_name=index_name)
    else:  # pragma: no cover
        atom = _tables().ObjectAtom()
        return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
                        index_name=index_name)
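
# Mapping sketch (illustrative only; uses this module's private helper and
# requires PyTables to be importable for _tables()): a couple of examples of
# the storage kinds the dispatch above produces.
#
#   >>> import pandas as pd
#   >>> _convert_index(pd.Index([1, 2, 3])).kind
#   'integer'
#   >>> _convert_index(pd.date_range('2000', periods=2)).kind
#   'datetime64'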

def _unconvert_index(data, kind, encoding=None, errors='strict'):
    kind = _ensure_decoded(kind)
    if kind == u'datetime64':
        index = DatetimeIndex(data)
    elif kind == u'timedelta64':
        index = TimedeltaIndex(data)
    elif kind == u'datetime':
        index = np.asarray([datetime.fromtimestamp(v) for v in data],
                           dtype=object)
    elif kind == u'date':
        try:
            index = np.asarray(
                [date.fromordinal(v) for v in data], dtype=object)
        except (ValueError):
            index = np.asarray(
                [date.fromtimestamp(v) for v in data], dtype=object)
    elif kind in (u'integer', u'float'):
        index = np.asarray(data)
    elif kind in (u'string'):
        index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
                                        errors=errors)
    elif kind == u'object':
        index = np.asarray(data[0])
    else:  # pragma: no cover
        raise ValueError('unrecognized index type {kind}'.format(kind=kind))
    return index


def _unconvert_index_legacy(data, kind, legacy=False, encoding=None,
                            errors='strict'):
    kind = _ensure_decoded(kind)
    if kind == u'datetime':
        index = to_datetime(data)
    elif kind in (u'integer'):
        index = np.asarray(data, dtype=object)
    elif kind in (u'string'):
        index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
                                        errors=errors)
    else:  # pragma: no cover
        raise ValueError('unrecognized index type {kind}'.format(kind=kind))
    return index

def _convert_string_array(data, encoding, errors, itemsize=None):
    """
    we take a string-like that is object dtype and coerce to a fixed size
    string type

    Parameters
    ----------
    data : a numpy array of object dtype
    encoding : None or string-encoding
    errors : handler for encoding errors
    itemsize : integer, optional, defaults to the max length of the strings

    Returns
    -------
    data in a fixed-length string dtype, encoded to bytes if needed
    """

    # encode if needed
    if encoding is not None and len(data):
        data = Series(data.ravel()).str.encode(
            encoding, errors).values.reshape(data.shape)

    # create the sized dtype
    if itemsize is None:
        ensured = ensure_object(data.ravel())
        itemsize = max(1, libwriters.max_len_string_array(ensured))

    data = np.asarray(data, dtype="S{size}".format(size=itemsize))
    return data

def _unconvert_string_array(data, nan_rep=None, encoding=None,
                            errors='strict'):
    """
    inverse of _convert_string_array

    Parameters
    ----------
    data : fixed length string dtyped array
    nan_rep : the storage repr of NaN, optional
    encoding : the encoding of the data, optional
    errors : handler for encoding errors, default 'strict'

    Returns
    -------
    an object array of the decoded data
    """
    shape = data.shape
    data = np.asarray(data.ravel(), dtype=object)

    # guard against a None encoding in PY3 (because of a legacy
    # where the passed encoding is actually None)
    encoding = _ensure_encoding(encoding)
    if encoding is not None and len(data):

        itemsize = libwriters.max_len_string_array(ensure_object(data))
        if compat.PY3:
            dtype = "U{0}".format(itemsize)
        else:
            dtype = "S{0}".format(itemsize)

        if isinstance(data[0], compat.binary_type):
            data = Series(data).str.decode(encoding, errors=errors).values
        else:
            data = data.astype(dtype, copy=False).astype(object, copy=False)

    if nan_rep is None:
        nan_rep = 'nan'

    data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
    return data.reshape(shape)

def _maybe_convert(values, val_kind, encoding, errors):
    if _need_convert(val_kind):
        conv = _get_converter(val_kind, encoding, errors)
        # conv = np.frompyfunc(conv, 1, 1)
        values = conv(values)
    return values


def _get_converter(kind, encoding, errors):
    kind = _ensure_decoded(kind)
    if kind == 'datetime64':
        return lambda x: np.asarray(x, dtype='M8[ns]')
    elif kind == 'datetime':
        return lambda x: to_datetime(x, cache=True).to_pydatetime()
    elif kind == 'string':
        return lambda x: _unconvert_string_array(x, encoding=encoding,
                                                 errors=errors)
    else:  # pragma: no cover
        raise ValueError('invalid kind {kind}'.format(kind=kind))


def _need_convert(kind):
    kind = _ensure_decoded(kind)
    if kind in (u'datetime', u'datetime64', u'string'):
        return True
    return False

class Selection(object):

    """
    Carries out a selection operation on a tables.Table object.

    Parameters
    ----------
    table : a Table object
    where : list of Terms (or convertible to)
    start, stop: indices to start and/or stop selection
    """

    def __init__(self, table, where=None, start=None, stop=None):
        self.table = table
        self.where = where
        self.start = start
        self.stop = stop
        self.condition = None
        self.filter = None
        self.terms = None
        self.coordinates = None

        if is_list_like(where):

            # see if we have a passed coordinate like
            try:
                inferred = lib.infer_dtype(where, skipna=False)
                if inferred == 'integer' or inferred == 'boolean':
                    where = np.asarray(where)
                    if where.dtype == np.bool_:
                        start, stop = self.start, self.stop
                        if start is None:
                            start = 0
                        if stop is None:
                            stop = self.table.nrows
                        self.coordinates = np.arange(start, stop)[where]
                    elif issubclass(where.dtype.type, np.integer):
                        if ((self.start is not None and
                                (where < self.start).any()) or
                                (self.stop is not None and
                                 (where >= self.stop).any())):
                            raise ValueError(
                                "where must have index locations >= start and "
                                "< stop"
                            )
                        self.coordinates = where

            except ValueError:
                pass

        if self.coordinates is None:

            self.terms = self.generate(where)

            # create the numexpr & the filter
            if self.terms is not None:
                self.condition, self.filter = self.terms.evaluate()

    def generate(self, where):
        """ where can be a : dict,list,tuple,string """
        if where is None:
            return None

        q = self.table.queryables()
        try:
            return Expr(where, queryables=q, encoding=self.table.encoding)
        except NameError:
            # raise a nice message, suggesting that the user should use
            # data_columns
            raise ValueError(
                "The passed where expression: {0}\n"
                "            contains an invalid variable reference\n"
                "            all of the variable references must be a "
                "reference to\n"
                "            an axis (e.g. 'index' or 'columns'), or a "
                "data_column\n"
                "            The currently defined references are: {1}\n"
                .format(where, ','.join(q.keys()))
            )

    def select(self):
        """
        generate the selection
        """
        if self.condition is not None:
            return self.table.table.read_where(self.condition.format(),
                                               start=self.start,
                                               stop=self.stop)
        elif self.coordinates is not None:
            return self.table.table.read_coordinates(self.coordinates)
        return self.table.table.read(start=self.start, stop=self.stop)

    def select_coords(self):
        """
        generate the selection
        """
        start, stop = self.start, self.stop
        nrows = self.table.nrows
        if start is None:
            start = 0
        elif start < 0:
            start += nrows
        if self.stop is None:
            stop = nrows
        elif stop < 0:
            stop += nrows

        if self.condition is not None:
            return self.table.table.get_where_list(self.condition.format(),
                                                   start=start, stop=stop,
                                                   sort=True)
        elif self.coordinates is not None:
            return self.coordinates

        return np.arange(start, stop)
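
# Usage sketch (illustrative only; 'demo.h5' and 'df' are made up): where
# clauses passed to HDFStore.select are compiled through Selection; string
# expressions become conditions/filters, while list-likes of integers or
# booleans are treated as row coordinates. A reference to a column that is
# not an axis or data_column raises the ValueError built in generate() above.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'A': range(5)})
#   >>> with pd.HDFStore('demo.h5', mode='w') as store:
#   ...     store.append('df', df, data_columns=['A'])
#   ...     store.select('df', where='A >= 3')   # condition
#   ...     store.select('df', where=[0, 1])     # coordinates
#   ...     # store.select('df', where='B == 1') would raise ValueError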

# utilities ###


def timeit(key, df, fn=None, remove=True, **kwargs):
    if fn is None:
        fn = 'timeit.h5'
    store = HDFStore(fn, mode='w')
    store.append(key, df, **kwargs)
    store.close()

    if remove:
        os.remove(fn)