stata.py 107 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989
  1. """
  2. Module contains tools for processing Stata files into DataFrames
  3. The StataReader below was originally written by Joe Presbrey as part of PyDTA.
  4. It has been extended and improved by Skipper Seabold from the Statsmodels
  5. project who also developed the StataWriter and was finally added to pandas in
  6. a once again improved version.
  7. You can find more information on http://presbrey.mit.edu/PyDTA and
  8. http://www.statsmodels.org/devel/
  9. """
  10. from collections import OrderedDict
  11. import datetime
  12. import os
  13. import struct
  14. import sys
  15. import warnings
  16. from dateutil.relativedelta import relativedelta
  17. import numpy as np
  18. from pandas._libs.lib import infer_dtype
  19. from pandas._libs.tslibs import NaT, Timestamp
  20. from pandas._libs.writers import max_len_string_array
  21. from pandas.compat import (
  22. BytesIO, ResourceWarning, lmap, lrange, lzip, range, string_types,
  23. text_type, zip)
  24. from pandas.util._decorators import Appender, deprecate_kwarg
  25. from pandas.core.dtypes.common import (
  26. ensure_object, is_categorical_dtype, is_datetime64_dtype)
  27. from pandas import DatetimeIndex, compat, isna, to_datetime, to_timedelta
  28. from pandas.core.arrays import Categorical
  29. from pandas.core.base import StringMixin
  30. from pandas.core.frame import DataFrame
  31. from pandas.core.series import Series
  32. from pandas.io.common import (
  33. BaseIterator, _stringify_path, get_filepath_or_buffer)
# Error message used whenever a .dta file's format/version byte is not one
# this module knows how to parse.
_version_error = ("Version of given Stata file is not 104, 105, 108, "
                  "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
                  "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)")

# Shared numpydoc fragments.  They are stitched together with %-formatting
# below to build the public docstrings for read_stata, StataReader and the
# reader's data/read methods, so the parameter documentation stays in sync.
_statafile_processing_params1 = """\
convert_dates : boolean, defaults to True
    Convert date variables to DataFrame time values.
convert_categoricals : boolean, defaults to True
    Read value labels and convert columns to Categorical/Factor variables."""

_encoding_params = """\
encoding : string, None or encoding
    Encoding used to parse the files. None defaults to latin-1."""

_statafile_processing_params2 = """\
index_col : string, optional, default: None
    Column to set as index.
convert_missing : boolean, defaults to False
    Flag indicating whether to convert missing values to their Stata
    representations. If False, missing values are replaced with nan.
    If True, columns containing missing values are returned with
    object data types and missing values are represented by
    StataMissingValue objects.
preserve_dtypes : boolean, defaults to True
    Preserve Stata datatypes. If False, numeric data are upcast to pandas
    default types for foreign data (float64 or int64).
columns : list or None
    Columns to retain. Columns will be returned in the given order. None
    returns all columns.
order_categoricals : boolean, defaults to True
    Flag indicating whether converted categorical data are ordered."""

_chunksize_params = """\
chunksize : int, default None
    Return StataReader object for iterations, returns chunks with
    given number of lines."""

_iterator_params = """\
iterator : boolean, default False
    Return StataReader object."""

# Docstring attached to read_stata via the Appender decorator.
_read_stata_doc = """
Read Stata file into DataFrame.

Parameters
----------
filepath_or_buffer : string or file-like object
    Path to .dta file or object implementing a binary read() functions.
%s
%s
%s
%s
%s

Returns
-------
DataFrame or StataReader

See Also
--------
pandas.io.stata.StataReader : Low-level reader for Stata data files.
pandas.DataFrame.to_stata: Export Stata data files.

Examples
--------
Read a Stata dta file:

>>> df = pd.read_stata('filename.dta')

Read a Stata dta file in 10,000 line chunks:

>>> itr = pd.read_stata('filename.dta', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)
""" % (_statafile_processing_params1, _encoding_params,
       _statafile_processing_params2, _chunksize_params,
       _iterator_params)

# Docstring for the legacy StataReader.data method.
_data_method_doc = """\
Reads observations from Stata file, converting them into a dataframe

.. deprecated::
    This is a legacy method. Use `read` in new code.

Parameters
----------
%s
%s

Returns
-------
DataFrame
""" % (_statafile_processing_params1, _statafile_processing_params2)

# Docstring for StataReader.read.
_read_method_doc = """\
Reads observations from Stata file, converting them into a dataframe

Parameters
----------
nrows : int
    Number of lines to read from data file, if None read whole file.
%s
%s

Returns
-------
DataFrame
""" % (_statafile_processing_params1, _statafile_processing_params2)

# Class-level docstring for StataReader.
_stata_reader_doc = """\
Class for reading Stata dta files.

Parameters
----------
path_or_buf : path (string), buffer or path object
    string, path object (pathlib.Path or py._path.local.LocalPath) or object
    implementing a binary read() functions.

    .. versionadded:: 0.23.0 support for pathlib, py.path.
%s
%s
%s
%s
""" % (_statafile_processing_params1, _statafile_processing_params2,
       _encoding_params, _chunksize_params)
  136. @Appender(_read_stata_doc)
  137. @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
  138. @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
  139. def read_stata(filepath_or_buffer, convert_dates=True,
  140. convert_categoricals=True, encoding=None, index_col=None,
  141. convert_missing=False, preserve_dtypes=True, columns=None,
  142. order_categoricals=True, chunksize=None, iterator=False):
  143. reader = StataReader(filepath_or_buffer,
  144. convert_dates=convert_dates,
  145. convert_categoricals=convert_categoricals,
  146. index_col=index_col, convert_missing=convert_missing,
  147. preserve_dtypes=preserve_dtypes,
  148. columns=columns,
  149. order_categoricals=order_categoricals,
  150. chunksize=chunksize)
  151. if iterator or chunksize:
  152. data = reader
  153. else:
  154. try:
  155. data = reader.read()
  156. finally:
  157. reader.close()
  158. return data
# Stata SIF display formats recognized by the date conversion routines below.
_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]

# Stata's epoch: SIF date offsets count from 01jan1960.
stata_epoch = datetime.datetime(1960, 1, 1)
  161. def _stata_elapsed_date_to_datetime_vec(dates, fmt):
  162. """
  163. Convert from SIF to datetime. http://www.stata.com/help.cgi?datetime
  164. Parameters
  165. ----------
  166. dates : Series
  167. The Stata Internal Format date to convert to datetime according to fmt
  168. fmt : str
  169. The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
  170. Returns
  171. Returns
  172. -------
  173. converted : Series
  174. The converted dates
  175. Examples
  176. --------
  177. >>> dates = pd.Series([52])
  178. >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw")
  179. 0 1961-01-01
  180. dtype: datetime64[ns]
  181. Notes
  182. -----
  183. datetime/c - tc
  184. milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
  185. datetime/C - tC - NOT IMPLEMENTED
  186. milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
  187. date - td
  188. days since 01jan1960 (01jan1960 = 0)
  189. weekly date - tw
  190. weeks since 1960w1
  191. This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
  192. The datetime value is the start of the week in terms of days in the
  193. year, not ISO calendar weeks.
  194. monthly date - tm
  195. months since 1960m1
  196. quarterly date - tq
  197. quarters since 1960q1
  198. half-yearly date - th
  199. half-years since 1960h1 yearly
  200. date - ty
  201. years since 0000
  202. If you don't have pandas with datetime support, then you can't do
  203. milliseconds accurately.
  204. """
  205. MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year
  206. MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days
  207. MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days
  208. MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000
  209. MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000
  210. def convert_year_month_safe(year, month):
  211. """
  212. Convert year and month to datetimes, using pandas vectorized versions
  213. when the date range falls within the range supported by pandas.
  214. Otherwise it falls back to a slower but more robust method
  215. using datetime.
  216. """
  217. if year.max() < MAX_YEAR and year.min() > MIN_YEAR:
  218. return to_datetime(100 * year + month, format='%Y%m')
  219. else:
  220. index = getattr(year, 'index', None)
  221. return Series(
  222. [datetime.datetime(y, m, 1) for y, m in zip(year, month)],
  223. index=index)
  224. def convert_year_days_safe(year, days):
  225. """
  226. Converts year (e.g. 1999) and days since the start of the year to a
  227. datetime or datetime64 Series
  228. """
  229. if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR:
  230. return (to_datetime(year, format='%Y') +
  231. to_timedelta(days, unit='d'))
  232. else:
  233. index = getattr(year, 'index', None)
  234. value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d))
  235. for y, d in zip(year, days)]
  236. return Series(value, index=index)
  237. def convert_delta_safe(base, deltas, unit):
  238. """
  239. Convert base dates and deltas to datetimes, using pandas vectorized
  240. versions if the deltas satisfy restrictions required to be expressed
  241. as dates in pandas.
  242. """
  243. index = getattr(deltas, 'index', None)
  244. if unit == 'd':
  245. if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA:
  246. values = [base + relativedelta(days=int(d)) for d in deltas]
  247. return Series(values, index=index)
  248. elif unit == 'ms':
  249. if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA:
  250. values = [base + relativedelta(microseconds=(int(d) * 1000))
  251. for d in deltas]
  252. return Series(values, index=index)
  253. else:
  254. raise ValueError('format not understood')
  255. base = to_datetime(base)
  256. deltas = to_timedelta(deltas, unit=unit)
  257. return base + deltas
  258. # TODO: If/when pandas supports more than datetime64[ns], this should be
  259. # improved to use correct range, e.g. datetime[Y] for yearly
  260. bad_locs = np.isnan(dates)
  261. has_bad_values = False
  262. if bad_locs.any():
  263. has_bad_values = True
  264. data_col = Series(dates)
  265. data_col[bad_locs] = 1.0 # Replace with NaT
  266. dates = dates.astype(np.int64)
  267. if fmt.startswith(("%tc", "tc")): # Delta ms relative to base
  268. base = stata_epoch
  269. ms = dates
  270. conv_dates = convert_delta_safe(base, ms, 'ms')
  271. elif fmt.startswith(("%tC", "tC")):
  272. warnings.warn("Encountered %tC format. Leaving in Stata "
  273. "Internal Format.")
  274. conv_dates = Series(dates, dtype=np.object)
  275. if has_bad_values:
  276. conv_dates[bad_locs] = NaT
  277. return conv_dates
  278. # Delta days relative to base
  279. elif fmt.startswith(("%td", "td", "%d", "d")):
  280. base = stata_epoch
  281. days = dates
  282. conv_dates = convert_delta_safe(base, days, 'd')
  283. # does not count leap days - 7 days is a week.
  284. # 52nd week may have more than 7 days
  285. elif fmt.startswith(("%tw", "tw")):
  286. year = stata_epoch.year + dates // 52
  287. days = (dates % 52) * 7
  288. conv_dates = convert_year_days_safe(year, days)
  289. elif fmt.startswith(("%tm", "tm")): # Delta months relative to base
  290. year = stata_epoch.year + dates // 12
  291. month = (dates % 12) + 1
  292. conv_dates = convert_year_month_safe(year, month)
  293. elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base
  294. year = stata_epoch.year + dates // 4
  295. month = (dates % 4) * 3 + 1
  296. conv_dates = convert_year_month_safe(year, month)
  297. elif fmt.startswith(("%th", "th")): # Delta half-years relative to base
  298. year = stata_epoch.year + dates // 2
  299. month = (dates % 2) * 6 + 1
  300. conv_dates = convert_year_month_safe(year, month)
  301. elif fmt.startswith(("%ty", "ty")): # Years -- not delta
  302. year = dates
  303. month = np.ones_like(dates)
  304. conv_dates = convert_year_month_safe(year, month)
  305. else:
  306. raise ValueError("Date fmt {fmt} not understood".format(fmt=fmt))
  307. if has_bad_values: # Restore NaT for bad values
  308. conv_dates[bad_locs] = NaT
  309. return conv_dates
  310. def _datetime_to_stata_elapsed_vec(dates, fmt):
  311. """
  312. Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime
  313. Parameters
  314. ----------
  315. dates : Series
  316. Series or array containing datetime.datetime or datetime64[ns] to
  317. convert to the Stata Internal Format given by fmt
  318. fmt : str
  319. The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
  320. """
  321. index = dates.index
  322. NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
  323. US_PER_DAY = NS_PER_DAY / 1000
  324. def parse_dates_safe(dates, delta=False, year=False, days=False):
  325. d = {}
  326. if is_datetime64_dtype(dates.values):
  327. if delta:
  328. delta = dates - stata_epoch
  329. d['delta'] = delta.values.astype(
  330. np.int64) // 1000 # microseconds
  331. if days or year:
  332. dates = DatetimeIndex(dates)
  333. d['year'], d['month'] = dates.year, dates.month
  334. if days:
  335. days = (dates.astype(np.int64) -
  336. to_datetime(d['year'], format='%Y').astype(np.int64))
  337. d['days'] = days // NS_PER_DAY
  338. elif infer_dtype(dates, skipna=False) == 'datetime':
  339. if delta:
  340. delta = dates.values - stata_epoch
  341. f = lambda x: \
  342. US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds
  343. v = np.vectorize(f)
  344. d['delta'] = v(delta)
  345. if year:
  346. year_month = dates.apply(lambda x: 100 * x.year + x.month)
  347. d['year'] = year_month.values // 100
  348. d['month'] = (year_month.values - d['year'] * 100)
  349. if days:
  350. f = lambda x: (x - datetime.datetime(x.year, 1, 1)).days
  351. v = np.vectorize(f)
  352. d['days'] = v(dates)
  353. else:
  354. raise ValueError('Columns containing dates must contain either '
  355. 'datetime64, datetime.datetime or null values.')
  356. return DataFrame(d, index=index)
  357. bad_loc = isna(dates)
  358. index = dates.index
  359. if bad_loc.any():
  360. dates = Series(dates)
  361. if is_datetime64_dtype(dates):
  362. dates[bad_loc] = to_datetime(stata_epoch)
  363. else:
  364. dates[bad_loc] = stata_epoch
  365. if fmt in ["%tc", "tc"]:
  366. d = parse_dates_safe(dates, delta=True)
  367. conv_dates = d.delta / 1000
  368. elif fmt in ["%tC", "tC"]:
  369. warnings.warn("Stata Internal Format tC not supported.")
  370. conv_dates = dates
  371. elif fmt in ["%td", "td"]:
  372. d = parse_dates_safe(dates, delta=True)
  373. conv_dates = d.delta // US_PER_DAY
  374. elif fmt in ["%tw", "tw"]:
  375. d = parse_dates_safe(dates, year=True, days=True)
  376. conv_dates = (52 * (d.year - stata_epoch.year) + d.days // 7)
  377. elif fmt in ["%tm", "tm"]:
  378. d = parse_dates_safe(dates, year=True)
  379. conv_dates = (12 * (d.year - stata_epoch.year) + d.month - 1)
  380. elif fmt in ["%tq", "tq"]:
  381. d = parse_dates_safe(dates, year=True)
  382. conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
  383. elif fmt in ["%th", "th"]:
  384. d = parse_dates_safe(dates, year=True)
  385. conv_dates = (2 * (d.year - stata_epoch.year) +
  386. (d.month > 6).astype(np.int))
  387. elif fmt in ["%ty", "ty"]:
  388. d = parse_dates_safe(dates, year=True)
  389. conv_dates = d.year
  390. else:
  391. raise ValueError(
  392. "Format {fmt} is not a known Stata date format".format(fmt=fmt))
  393. conv_dates = Series(conv_dates, dtype=np.float64)
  394. missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
  395. conv_dates[bad_loc] = missing_value
  396. return Series(conv_dates, index=index)
# %-template for the error raised when a fixed-width string column is too
# long for the legacy (dta version <= 115) format.
excessive_string_length_error = """
Fixed width strings in Stata .dta files are limited to 244 (or fewer)
characters. Column '%s' does not satisfy this restriction. Use the
'version=117' parameter to write the newer (Stata 13 and later) format.
"""


class PossiblePrecisionLoss(Warning):
    """Warning: values were cast to a wider type and may lose precision."""
    pass


# %-template for PossiblePrecisionLoss messages; filled with the source and
# target dtype names.
precision_loss_doc = """
Column converted from %s to %s, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""


class ValueLabelTypeMismatch(Warning):
    """Warning: non-string categories were coerced to str for value labels."""
    pass


# .format()-template for ValueLabelTypeMismatch messages; {0} is the column.
value_label_mismatch_doc = """
Stata value labels (pandas categories) must be strings. Column {0} contains
non-string labels which will be converted to strings. Please check that the
Stata data file created has not lost information due to duplicate labels.
"""


class InvalidColumnName(Warning):
    """Warning: column names were rewritten to be valid Stata variables."""
    pass


# .format()-template for InvalidColumnName messages; {0} lists replacements.
invalid_name_doc = """
Not all pandas column names were valid Stata variable names.
The following replacements have been made:
    {0}
If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)
"""
  425. def _cast_to_stata_types(data):
  426. """Checks the dtypes of the columns of a pandas DataFrame for
  427. compatibility with the data types and ranges supported by Stata, and
  428. converts if necessary.
  429. Parameters
  430. ----------
  431. data : DataFrame
  432. The DataFrame to check and convert
  433. Notes
  434. -----
  435. Numeric columns in Stata must be one of int8, int16, int32, float32 or
  436. float64, with some additional value restrictions. int8 and int16 columns
  437. are checked for violations of the value restrictions and upcast if needed.
  438. int64 data is not usable in Stata, and so it is downcast to int32 whenever
  439. the value are in the int32 range, and sidecast to float64 when larger than
  440. this range. If the int64 values are outside of the range of those
  441. perfectly representable as float64 values, a warning is raised.
  442. bool columns are cast to int8. uint columns are converted to int of the
  443. same size if there is no loss in precision, otherwise are upcast to a
  444. larger type. uint64 is currently not supported since it is concerted to
  445. object in a DataFrame.
  446. """
  447. ws = ''
  448. # original, if small, if large
  449. conversion_data = ((np.bool, np.int8, np.int8),
  450. (np.uint8, np.int8, np.int16),
  451. (np.uint16, np.int16, np.int32),
  452. (np.uint32, np.int32, np.int64))
  453. float32_max = struct.unpack('<f', b'\xff\xff\xff\x7e')[0]
  454. float64_max = struct.unpack('<d', b'\xff\xff\xff\xff\xff\xff\xdf\x7f')[0]
  455. for col in data:
  456. dtype = data[col].dtype
  457. # Cast from unsupported types to supported types
  458. for c_data in conversion_data:
  459. if dtype == c_data[0]:
  460. if data[col].max() <= np.iinfo(c_data[1]).max:
  461. dtype = c_data[1]
  462. else:
  463. dtype = c_data[2]
  464. if c_data[2] == np.float64: # Warn if necessary
  465. if data[col].max() >= 2 ** 53:
  466. ws = precision_loss_doc % ('uint64', 'float64')
  467. data[col] = data[col].astype(dtype)
  468. # Check values and upcast if necessary
  469. if dtype == np.int8:
  470. if data[col].max() > 100 or data[col].min() < -127:
  471. data[col] = data[col].astype(np.int16)
  472. elif dtype == np.int16:
  473. if data[col].max() > 32740 or data[col].min() < -32767:
  474. data[col] = data[col].astype(np.int32)
  475. elif dtype == np.int64:
  476. if (data[col].max() <= 2147483620 and
  477. data[col].min() >= -2147483647):
  478. data[col] = data[col].astype(np.int32)
  479. else:
  480. data[col] = data[col].astype(np.float64)
  481. if data[col].max() >= 2 ** 53 or data[col].min() <= -2 ** 53:
  482. ws = precision_loss_doc % ('int64', 'float64')
  483. elif dtype in (np.float32, np.float64):
  484. value = data[col].max()
  485. if np.isinf(value):
  486. raise ValueError('Column {col} has a maximum value of '
  487. 'infinity which is outside the range '
  488. 'supported by Stata.'.format(col=col))
  489. if dtype == np.float32 and value > float32_max:
  490. data[col] = data[col].astype(np.float64)
  491. elif dtype == np.float64:
  492. if value > float64_max:
  493. raise ValueError('Column {col} has a maximum value '
  494. '({val}) outside the range supported by '
  495. 'Stata ({float64_max})'
  496. .format(col=col, val=value,
  497. float64_max=float64_max))
  498. if ws:
  499. warnings.warn(ws, PossiblePrecisionLoss)
  500. return data
class StataValueLabel(object):
    """
    Parse a categorical column and prepare formatted output

    Parameters
    ----------
    catarray : Series
        Categorical Series to encode; its name becomes the label name and
        its category codes/values become the value-label mapping.

    Methods
    -------
    generate_value_label
    """
  518. def __init__(self, catarray):
  519. self.labname = catarray.name
  520. categories = catarray.cat.categories
  521. self.value_labels = list(zip(np.arange(len(categories)), categories))
  522. self.value_labels.sort(key=lambda x: x[0])
  523. self.text_len = np.int32(0)
  524. self.off = []
  525. self.val = []
  526. self.txt = []
  527. self.n = 0
  528. # Compute lengths and setup lists of offsets and labels
  529. for vl in self.value_labels:
  530. category = vl[1]
  531. if not isinstance(category, string_types):
  532. category = str(category)
  533. warnings.warn(value_label_mismatch_doc.format(catarray.name),
  534. ValueLabelTypeMismatch)
  535. self.off.append(self.text_len)
  536. self.text_len += len(category) + 1 # +1 for the padding
  537. self.val.append(vl[0])
  538. self.txt.append(category)
  539. self.n += 1
  540. if self.text_len > 32000:
  541. raise ValueError('Stata value labels for a single variable must '
  542. 'have a combined length less than 32,000 '
  543. 'characters.')
  544. # Ensure int32
  545. self.off = np.array(self.off, dtype=np.int32)
  546. self.val = np.array(self.val, dtype=np.int32)
  547. # Total length
  548. self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
  549. def _encode(self, s):
  550. """
  551. Python 3 compatibility shim
  552. """
  553. if compat.PY3:
  554. return s.encode(self._encoding)
  555. else:
  556. return s
  557. def generate_value_label(self, byteorder, encoding):
  558. """
  559. Parameters
  560. ----------
  561. byteorder : str
  562. Byte order of the output
  563. encoding : str
  564. File encoding
  565. Returns
  566. -------
  567. value_label : bytes
  568. Bytes containing the formatted value label
  569. """
  570. self._encoding = encoding
  571. bio = BytesIO()
  572. null_string = '\x00'
  573. null_byte = b'\x00'
  574. # len
  575. bio.write(struct.pack(byteorder + 'i', self.len))
  576. # labname
  577. labname = self._encode(_pad_bytes(self.labname[:32], 33))
  578. bio.write(labname)
  579. # padding - 3 bytes
  580. for i in range(3):
  581. bio.write(struct.pack('c', null_byte))
  582. # value_label_table
  583. # n - int32
  584. bio.write(struct.pack(byteorder + 'i', self.n))
  585. # textlen - int32
  586. bio.write(struct.pack(byteorder + 'i', self.text_len))
  587. # off - int32 array (n elements)
  588. for offset in self.off:
  589. bio.write(struct.pack(byteorder + 'i', offset))
  590. # val - int32 array (n elements)
  591. for value in self.val:
  592. bio.write(struct.pack(byteorder + 'i', value))
  593. # txt - Text labels, null terminated
  594. for text in self.txt:
  595. bio.write(self._encode(text + null_string))
  596. bio.seek(0)
  597. return bio.read()
class StataMissingValue(StringMixin):
    """
    An observation's missing value.

    Parameters
    -----------
    value : int8, int16, int32, float32 or float64
        The Stata missing value code

    Attributes
    ----------
    string : string
        String representation of the Stata missing value
    value : int8, int16, int32, float32 or float64
        The original encoded missing value

    Notes
    -----
    More information: <http://www.stata.com/help.cgi?missing>

    Integer missing values make the code '.', '.a', ..., '.z' to the ranges
    101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ...
    2147483647 (for int32). Missing values for floating point data types are
    more complex but the pattern is simple to discern from the following
    table.

    np.float32 missing values (float in Stata)
    0000007f    .
    0008007f    .a
    0010007f    .b
    ...
    00c0007f    .x
    00c8007f    .y
    00d0007f    .z

    np.float64 missing values (double in Stata)
    000000000000e07f    .
    000000000001e07f    .a
    000000000002e07f    .b
    ...
    000000000018e07f    .x
    000000000019e07f    .y
    00000000001ae07f    .z
    """

    # Construct a dictionary of missing values
    MISSING_VALUES = {}
    bases = (101, 32741, 2147483621)
    for b in bases:
        # Conversion to long to avoid hash issues on 32 bit platforms #8968
        MISSING_VALUES[compat.long(b)] = '.'
        for i in range(1, 27):
            MISSING_VALUES[compat.long(i + b)] = '.' + chr(96 + i)

    # float32 codes: start at 0x7f000000 and step the *integer* bit pattern
    # by 0x800 per missing code (27 codes: '.', '.a', ..., '.z').
    float32_base = b'\x00\x00\x00\x7f'
    increment = struct.unpack('<i', b'\x00\x08\x00\x00')[0]
    for i in range(27):
        value = struct.unpack('<f', float32_base)[0]
        MISSING_VALUES[value] = '.'
        if i > 0:
            MISSING_VALUES[value] += chr(96 + i)
        int_value = struct.unpack('<i', struct.pack('<f', value))[
            0] + increment
        float32_base = struct.pack('<i', int_value)

    # float64 codes: start at 0x7fe0000000000000, step by 2**40 per code.
    float64_base = b'\x00\x00\x00\x00\x00\x00\xe0\x7f'
    increment = struct.unpack('q', b'\x00\x00\x00\x00\x00\x01\x00\x00')[0]
    for i in range(27):
        value = struct.unpack('<d', float64_base)[0]
        MISSING_VALUES[value] = '.'
        if i > 0:
            MISSING_VALUES[value] += chr(96 + i)
        int_value = struct.unpack('q', struct.pack('<d', value))[0] + increment
        float64_base = struct.pack('q', int_value)

    # NOTE(review): after the loops above, float32_base/float64_base are one
    # increment *past* the '.z' code, so the float entries below are not the
    # '.' base pattern -- confirm this is the intended writer behavior.
    BASE_MISSING_VALUES = {'int8': 101,
                           'int16': 32741,
                           'int32': 2147483621,
                           'float32': struct.unpack('<f', float32_base)[0],
                           'float64': struct.unpack('<d', float64_base)[0]}

    def __init__(self, value):
        # Keep the raw encoded value; look up its display string.
        self._value = value
        # Conversion to long to avoid hash issues on 32 bit platforms #8968
        value = compat.long(value) if value < 2147483648 else float(value)
        self._str = self.MISSING_VALUES[value]

    string = property(lambda self: self._str,
                      doc="The Stata representation of the missing value: "
                          "'.', '.a'..'.z'")
    value = property(lambda self: self._value,
                     doc='The binary representation of the missing value.')

    def __unicode__(self):
        return self.string

    def __repr__(self):
        # not perfect :-/
        return "{cls}({obj})".format(cls=self.__class__, obj=self)

    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable on Python 3 unless a base class supplies __hash__ --
    # confirm instances are never used as dict keys / set members.
    def __eq__(self, other):
        return (isinstance(other, self.__class__) and
                self.string == other.string and self.value == other.value)

    @classmethod
    def get_base_missing_value(cls, dtype):
        # Map a numpy dtype to the generic '.' replacement code used when
        # writing missing data.
        if dtype == np.int8:
            value = cls.BASE_MISSING_VALUES['int8']
        elif dtype == np.int16:
            value = cls.BASE_MISSING_VALUES['int16']
        elif dtype == np.int32:
            value = cls.BASE_MISSING_VALUES['int32']
        elif dtype == np.float32:
            value = cls.BASE_MISSING_VALUES['float32']
        elif dtype == np.float64:
            value = cls.BASE_MISSING_VALUES['float64']
        else:
            raise ValueError('Unsupported dtype')
        return value
  700. class StataParser(object):
  701. def __init__(self):
  702. # type code.
  703. # --------------------
  704. # str1 1 = 0x01
  705. # str2 2 = 0x02
  706. # ...
  707. # str244 244 = 0xf4
  708. # byte 251 = 0xfb (sic)
  709. # int 252 = 0xfc
  710. # long 253 = 0xfd
  711. # float 254 = 0xfe
  712. # double 255 = 0xff
  713. # --------------------
  714. # NOTE: the byte type seems to be reserved for categorical variables
  715. # with a label, but the underlying variable is -127 to 100
  716. # we're going to drop the label and cast to int
  717. self.DTYPE_MAP = \
  718. dict(
  719. lzip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) +
  720. [
  721. (251, np.int8),
  722. (252, np.int16),
  723. (253, np.int32),
  724. (254, np.float32),
  725. (255, np.float64)
  726. ]
  727. )
  728. self.DTYPE_MAP_XML = \
  729. dict(
  730. [
  731. (32768, np.uint8), # Keys to GSO
  732. (65526, np.float64),
  733. (65527, np.float32),
  734. (65528, np.int32),
  735. (65529, np.int16),
  736. (65530, np.int8)
  737. ]
  738. )
  739. self.TYPE_MAP = lrange(251) + list('bhlfd')
  740. self.TYPE_MAP_XML = \
  741. dict(
  742. [
  743. # Not really a Q, unclear how to handle byteswap
  744. (32768, 'Q'),
  745. (65526, 'd'),
  746. (65527, 'f'),
  747. (65528, 'l'),
  748. (65529, 'h'),
  749. (65530, 'b')
  750. ]
  751. )
  752. # NOTE: technically, some of these are wrong. there are more numbers
  753. # that can be represented. it's the 27 ABOVE and BELOW the max listed
  754. # numeric data type in [U] 12.2.2 of the 11.2 manual
  755. float32_min = b'\xff\xff\xff\xfe'
  756. float32_max = b'\xff\xff\xff\x7e'
  757. float64_min = b'\xff\xff\xff\xff\xff\xff\xef\xff'
  758. float64_max = b'\xff\xff\xff\xff\xff\xff\xdf\x7f'
  759. self.VALID_RANGE = {
  760. 'b': (-127, 100),
  761. 'h': (-32767, 32740),
  762. 'l': (-2147483647, 2147483620),
  763. 'f': (np.float32(struct.unpack('<f', float32_min)[0]),
  764. np.float32(struct.unpack('<f', float32_max)[0])),
  765. 'd': (np.float64(struct.unpack('<d', float64_min)[0]),
  766. np.float64(struct.unpack('<d', float64_max)[0]))
  767. }
  768. self.OLD_TYPE_MAPPING = {
  769. 98: 251, # byte
  770. 105: 252, # int
  771. 108: 253, # long
  772. 102: 254 # float
  773. # don't know old code for double
  774. }
  775. # These missing values are the generic '.' in Stata, and are used
  776. # to replace nans
  777. self.MISSING_VALUES = {
  778. 'b': 101,
  779. 'h': 32741,
  780. 'l': 2147483621,
  781. 'f': np.float32(struct.unpack('<f', b'\x00\x00\x00\x7f')[0]),
  782. 'd': np.float64(
  783. struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
  784. }
  785. self.NUMPY_TYPE_MAP = {
  786. 'b': 'i1',
  787. 'h': 'i2',
  788. 'l': 'i4',
  789. 'f': 'f4',
  790. 'd': 'f8',
  791. 'Q': 'u8'
  792. }
  793. # Reserved words cannot be used as variable names
  794. self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
  795. 'byte', 'case', 'catch', 'class', 'colvector',
  796. 'complex', 'const', 'continue', 'default',
  797. 'delegate', 'delete', 'do', 'double', 'else',
  798. 'eltypedef', 'end', 'enum', 'explicit',
  799. 'export', 'external', 'float', 'for', 'friend',
  800. 'function', 'global', 'goto', 'if', 'inline',
  801. 'int', 'local', 'long', 'NULL', 'pragma',
  802. 'protected', 'quad', 'rowvector', 'short',
  803. 'typedef', 'typename', 'virtual')
  804. class StataReader(StataParser, BaseIterator):
  805. __doc__ = _stata_reader_doc
    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
    @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
    def __init__(self, path_or_buf, convert_dates=True,
                 convert_categoricals=True, index_col=None,
                 convert_missing=False, preserve_dtypes=True,
                 columns=None, order_categoricals=True,
                 encoding=None, chunksize=None):
        # `encoding` is deprecated (mapped to None by the decorator); the
        # actual encoding is derived from the format version later in
        # _set_encoding().
        super(StataReader, self).__init__()
        self.col_sizes = ()

        # Arguments to the reader (can be temporarily overridden in
        # calls to read).
        self._convert_dates = convert_dates
        self._convert_categoricals = convert_categoricals
        self._index_col = index_col
        self._convert_missing = convert_missing
        self._preserve_dtypes = preserve_dtypes
        self._columns = columns
        self._order_categoricals = order_categoricals
        self._encoding = None
        self._chunksize = chunksize

        # State variables for the file
        self._has_string_data = False
        self._missing_values = False
        self._can_read_value_labels = False
        self._column_selector_set = False
        self._value_labels_read = False
        self._data_read = False
        self._dtype = None
        self._lines_read = 0

        self._native_byteorder = _set_endianness(sys.byteorder)
        path_or_buf = _stringify_path(path_or_buf)
        if isinstance(path_or_buf, str):
            # May resolve URLs/compressed inputs to a local buffer.
            # NOTE(review): `should_close` is never consulted afterwards;
            # close() unconditionally closes the handle -- confirm intended.
            path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
                path_or_buf)

        if isinstance(path_or_buf, (str, text_type, bytes)):
            self.path_or_buf = open(path_or_buf, 'rb')
        else:
            # Copy to BytesIO, and ensure no encoding
            contents = path_or_buf.read()
            self.path_or_buf = BytesIO(contents)

        # Parse the header eagerly so metadata is available immediately.
        self._read_header()
        self._setup_dtype()
  848. def __enter__(self):
  849. """ enter context manager """
  850. return self
  851. def __exit__(self, exc_type, exc_value, traceback):
  852. """ exit context manager """
  853. self.close()
  854. def close(self):
  855. """ close the handle if its open """
  856. try:
  857. self.path_or_buf.close()
  858. except IOError:
  859. pass
  860. def _set_encoding(self):
  861. """
  862. Set string encoding which depends on file version
  863. """
  864. if self.format_version < 118:
  865. self._encoding = 'latin-1'
  866. else:
  867. self._encoding = 'utf-8'
  868. def _read_header(self):
  869. first_char = self.path_or_buf.read(1)
  870. if struct.unpack('c', first_char)[0] == b'<':
  871. self._read_new_header(first_char)
  872. else:
  873. self._read_old_header(first_char)
  874. self.has_string_data = len([x for x in self.typlist
  875. if type(x) is int]) > 0
  876. # calculate size of a data record
  877. self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)
    def _read_new_header(self, first_char):
        # Parse a format 117/118 header.  The fixed-size reads skip the
        # XML-like tags; the trailing comments name the tag being skipped.
        # The first part of the header is common to 117 and 118.
        self.path_or_buf.read(27)  # stata_dta><header><release>
        self.format_version = int(self.path_or_buf.read(3))
        if self.format_version not in [117, 118]:
            raise ValueError(_version_error)
        self._set_encoding()
        self.path_or_buf.read(21)  # </release><byteorder>
        self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
        self.path_or_buf.read(15)  # </byteorder><K>
        self.nvar = struct.unpack(self.byteorder + 'H',
                                  self.path_or_buf.read(2))[0]
        self.path_or_buf.read(7)  # </K><N>
        self.nobs = self._get_nobs()
        self.path_or_buf.read(11)  # </N><label>
        self.data_label = self._get_data_label()
        self.path_or_buf.read(19)  # </label><timestamp>
        self.time_stamp = self._get_time_stamp()
        self.path_or_buf.read(26)  # </timestamp></header><map>
        self.path_or_buf.read(8)  # 0x0000000000000000
        self.path_or_buf.read(8)  # position of <map>

        # The <map> block is a table of int64 file offsets; the small
        # constants added below skip the opening tag at each target.
        self._seek_vartypes = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 16
        self._seek_varnames = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10
        self._seek_sortlist = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 10
        self._seek_formats = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 9
        self._seek_value_label_names = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 19

        # Requires version-specific treatment
        self._seek_variable_labels = self._get_seek_variable_labels()

        self.path_or_buf.read(8)  # <characteristics>
        self.data_location = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 6
        self.seek_strls = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 7
        self.seek_value_labels = struct.unpack(
            self.byteorder + 'q', self.path_or_buf.read(8))[0] + 14

        self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes)

        # Jump to each section recorded in the map and parse it.
        self.path_or_buf.seek(self._seek_varnames)
        self.varlist = self._get_varlist()

        self.path_or_buf.seek(self._seek_sortlist)
        self.srtlist = struct.unpack(
            self.byteorder + ('h' * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1))
        )[:-1]

        self.path_or_buf.seek(self._seek_formats)
        self.fmtlist = self._get_fmtlist()

        self.path_or_buf.seek(self._seek_value_label_names)
        self.lbllist = self._get_lbllist()

        self.path_or_buf.seek(self._seek_variable_labels)
        self._variable_labels = self._get_variable_labels()
  932. # Get data type information, works for versions 117-118.
  933. def _get_dtypes(self, seek_vartypes):
  934. self.path_or_buf.seek(seek_vartypes)
  935. raw_typlist = [struct.unpack(self.byteorder + 'H',
  936. self.path_or_buf.read(2))[0]
  937. for i in range(self.nvar)]
  938. def f(typ):
  939. if typ <= 2045:
  940. return typ
  941. try:
  942. return self.TYPE_MAP_XML[typ]
  943. except KeyError:
  944. raise ValueError("cannot convert stata types [{0}]".
  945. format(typ))
  946. typlist = [f(x) for x in raw_typlist]
  947. def f(typ):
  948. if typ <= 2045:
  949. return str(typ)
  950. try:
  951. return self.DTYPE_MAP_XML[typ]
  952. except KeyError:
  953. raise ValueError("cannot convert stata dtype [{0}]"
  954. .format(typ))
  955. dtyplist = [f(x) for x in raw_typlist]
  956. return typlist, dtyplist
  957. def _get_varlist(self):
  958. if self.format_version == 117:
  959. b = 33
  960. elif self.format_version == 118:
  961. b = 129
  962. return [self._null_terminate(self.path_or_buf.read(b))
  963. for i in range(self.nvar)]
  964. # Returns the format list
  965. def _get_fmtlist(self):
  966. if self.format_version == 118:
  967. b = 57
  968. elif self.format_version > 113:
  969. b = 49
  970. elif self.format_version > 104:
  971. b = 12
  972. else:
  973. b = 7
  974. return [self._null_terminate(self.path_or_buf.read(b))
  975. for i in range(self.nvar)]
  976. # Returns the label list
  977. def _get_lbllist(self):
  978. if self.format_version >= 118:
  979. b = 129
  980. elif self.format_version > 108:
  981. b = 33
  982. else:
  983. b = 9
  984. return [self._null_terminate(self.path_or_buf.read(b))
  985. for i in range(self.nvar)]
  986. def _get_variable_labels(self):
  987. if self.format_version == 118:
  988. vlblist = [self._decode(self.path_or_buf.read(321))
  989. for i in range(self.nvar)]
  990. elif self.format_version > 105:
  991. vlblist = [self._null_terminate(self.path_or_buf.read(81))
  992. for i in range(self.nvar)]
  993. else:
  994. vlblist = [self._null_terminate(self.path_or_buf.read(32))
  995. for i in range(self.nvar)]
  996. return vlblist
  997. def _get_nobs(self):
  998. if self.format_version == 118:
  999. return struct.unpack(self.byteorder + 'Q',
  1000. self.path_or_buf.read(8))[0]
  1001. else:
  1002. return struct.unpack(self.byteorder + 'I',
  1003. self.path_or_buf.read(4))[0]
  1004. def _get_data_label(self):
  1005. if self.format_version == 118:
  1006. strlen = struct.unpack(self.byteorder + 'H',
  1007. self.path_or_buf.read(2))[0]
  1008. return self._decode(self.path_or_buf.read(strlen))
  1009. elif self.format_version == 117:
  1010. strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
  1011. return self._null_terminate(self.path_or_buf.read(strlen))
  1012. elif self.format_version > 105:
  1013. return self._null_terminate(self.path_or_buf.read(81))
  1014. else:
  1015. return self._null_terminate(self.path_or_buf.read(32))
  1016. def _get_time_stamp(self):
  1017. if self.format_version == 118:
  1018. strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
  1019. return self.path_or_buf.read(strlen).decode("utf-8")
  1020. elif self.format_version == 117:
  1021. strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
  1022. return self._null_terminate(self.path_or_buf.read(strlen))
  1023. elif self.format_version > 104:
  1024. return self._null_terminate(self.path_or_buf.read(18))
  1025. else:
  1026. raise ValueError()
  1027. def _get_seek_variable_labels(self):
  1028. if self.format_version == 117:
  1029. self.path_or_buf.read(8) # <variable_lables>, throw away
  1030. # Stata 117 data files do not follow the described format. This is
  1031. # a work around that uses the previous label, 33 bytes for each
  1032. # variable, 20 for the closing tag and 17 for the opening tag
  1033. return self._seek_value_label_names + (33 * self.nvar) + 20 + 17
  1034. elif self.format_version == 118:
  1035. return struct.unpack(self.byteorder + 'q',
  1036. self.path_or_buf.read(8))[0] + 17
  1037. else:
  1038. raise ValueError()
    def _read_old_header(self, first_char):
        # Parse a pre-117 (binary, non-XML) header.  `first_char` is the
        # already-consumed first byte, which holds the format version.
        self.format_version = struct.unpack('b', first_char)[0]
        if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
            raise ValueError(_version_error)
        self._set_encoding()
        # Byte 2: 0x1 means big-endian, anything else little-endian.
        self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
            0] == 0x1 and '>' or '<'
        self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
        self.path_or_buf.read(1)  # unused
        self.nvar = struct.unpack(self.byteorder + 'H',
                                  self.path_or_buf.read(2))[0]
        self.nobs = self._get_nobs()
        self.data_label = self._get_data_label()
        self.time_stamp = self._get_time_stamp()

        # descriptors
        if self.format_version > 108:
            # One raw type byte per variable.
            typlist = [ord(self.path_or_buf.read(1))
                       for i in range(self.nvar)]
        else:
            # Very old formats store ASCII letters; remap them to the
            # modern numeric type codes.
            buf = self.path_or_buf.read(self.nvar)
            typlistb = np.frombuffer(buf, dtype=np.uint8)
            typlist = []
            for tp in typlistb:
                if tp in self.OLD_TYPE_MAPPING:
                    typlist.append(self.OLD_TYPE_MAPPING[tp])
                else:
                    typlist.append(tp - 127)  # py2 string, py3 bytes

        try:
            self.typlist = [self.TYPE_MAP[typ] for typ in typlist]
        except ValueError:
            raise ValueError("cannot convert stata types [{0}]"
                             .format(','.join(str(x) for x in typlist)))
        try:
            self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
        except ValueError:
            raise ValueError("cannot convert stata dtypes [{0}]"
                             .format(','.join(str(x) for x in typlist)))

        # Variable names: 33-byte fields after 108, 9-byte fields before.
        if self.format_version > 108:
            self.varlist = [self._null_terminate(self.path_or_buf.read(33))
                            for i in range(self.nvar)]
        else:
            self.varlist = [self._null_terminate(self.path_or_buf.read(9))
                            for i in range(self.nvar)]
        # Sort order list: nvar+1 int16 entries; the terminator is dropped.
        self.srtlist = struct.unpack(
            self.byteorder + ('h' * (self.nvar + 1)),
            self.path_or_buf.read(2 * (self.nvar + 1))
        )[:-1]

        self.fmtlist = self._get_fmtlist()
        self.lbllist = self._get_lbllist()
        self._variable_labels = self._get_variable_labels()

        # ignore expansion fields (Format 105 and later)
        # When reading, read five bytes; the last four bytes now tell you
        # the size of the next read, which you discard. You then continue
        # like this until you read 5 bytes of zeros.
        if self.format_version > 104:
            while True:
                data_type = struct.unpack(self.byteorder + 'b',
                                          self.path_or_buf.read(1))[0]
                if self.format_version > 108:
                    data_len = struct.unpack(self.byteorder + 'i',
                                             self.path_or_buf.read(4))[0]
                else:
                    data_len = struct.unpack(self.byteorder + 'h',
                                             self.path_or_buf.read(2))[0]
                if data_type == 0:
                    break
                self.path_or_buf.read(data_len)

        # necessary data to continue parsing
        self.data_location = self.path_or_buf.tell()
  1108. def _setup_dtype(self):
  1109. """Map between numpy and state dtypes"""
  1110. if self._dtype is not None:
  1111. return self._dtype
  1112. dtype = [] # Convert struct data types to numpy data type
  1113. for i, typ in enumerate(self.typlist):
  1114. if typ in self.NUMPY_TYPE_MAP:
  1115. dtype.append(('s' + str(i), self.byteorder +
  1116. self.NUMPY_TYPE_MAP[typ]))
  1117. else:
  1118. dtype.append(('s' + str(i), 'S' + str(typ)))
  1119. dtype = np.dtype(dtype)
  1120. self._dtype = dtype
  1121. return self._dtype
  1122. def _calcsize(self, fmt):
  1123. return (type(fmt) is int and fmt or
  1124. struct.calcsize(self.byteorder + fmt))
  1125. def _decode(self, s):
  1126. s = s.partition(b"\0")[0]
  1127. return s.decode('utf-8')
  1128. def _null_terminate(self, s):
  1129. # have bytes not strings, so must decode
  1130. s = s.partition(b"\0")[0]
  1131. return s.decode(self._encoding)
    def _read_value_labels(self):
        # Populate self.value_label_dict ({labname: {code: text}}) from the
        # value-label section of the file.  Idempotent: a second call
        # returns immediately.
        if self._value_labels_read:
            # Don't read twice
            return
        if self.format_version <= 108:
            # Value labels are not supported in version 108 and earlier.
            self._value_labels_read = True
            self.value_label_dict = dict()
            return

        if self.format_version >= 117:
            self.path_or_buf.seek(self.seek_value_labels)
        else:
            # Older formats place value labels straight after the data.
            offset = self.nobs * self._dtype.itemsize
            self.path_or_buf.seek(self.data_location + offset)

        self._value_labels_read = True
        self.value_label_dict = dict()

        while True:
            if self.format_version >= 117:
                if self.path_or_buf.read(5) == b'</val':  # <lbl>
                    break  # end of value label table

            slength = self.path_or_buf.read(4)
            if not slength:
                break  # end of value label table (format < 117)
            if self.format_version <= 117:
                labname = self._null_terminate(self.path_or_buf.read(33))
            else:
                labname = self._decode(self.path_or_buf.read(129))
            self.path_or_buf.read(3)  # padding

            # n = number of labels, txtlen = size of the text block
            n = struct.unpack(self.byteorder + 'I',
                              self.path_or_buf.read(4))[0]
            txtlen = struct.unpack(self.byteorder + 'I',
                                   self.path_or_buf.read(4))[0]
            # per-label byte offsets into the text block, then the codes
            off = np.frombuffer(self.path_or_buf.read(4 * n),
                                dtype=self.byteorder + "i4",
                                count=n)
            val = np.frombuffer(self.path_or_buf.read(4 * n),
                                dtype=self.byteorder + "i4",
                                count=n)
            # sort both arrays by offset so adjacent offsets delimit labels
            ii = np.argsort(off)
            off = off[ii]
            val = val[ii]
            txt = self.path_or_buf.read(txtlen)
            self.value_label_dict[labname] = dict()
            for i in range(n):
                # label i ends where label i+1 starts (or at txtlen)
                end = off[i + 1] if i < n - 1 else txtlen
                if self.format_version <= 117:
                    self.value_label_dict[labname][val[i]] = (
                        self._null_terminate(txt[off[i]:end]))
                else:
                    self.value_label_dict[labname][val[i]] = (
                        self._decode(txt[off[i]:end]))
            if self.format_version >= 117:
                self.path_or_buf.read(6)  # </lbl>
        self._value_labels_read = True
    def _read_strls(self):
        # Read the strL (long string) table into self.GSO, keyed by the
        # string form of the (v, o) identifier.
        self.path_or_buf.seek(self.seek_strls)
        # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
        self.GSO = {'0': ''}
        while True:
            if self.path_or_buf.read(3) != b'GSO':
                break  # no more GSO records

            if self.format_version == 117:
                # v_o stored as a single uint64
                v_o = struct.unpack(self.byteorder + 'Q',
                                    self.path_or_buf.read(8))[0]
            else:
                # Format 118 stores v (uint32) and o (uint64) separately;
                # splice the relevant bytes back into one 8-byte key.
                buf = self.path_or_buf.read(12)
                # Only tested on little endian file on little endian machine.
                if self.byteorder == '<':
                    buf = buf[0:2] + buf[4:10]
                else:
                    buf = buf[0:2] + buf[6:]
                v_o = struct.unpack('Q', buf)[0]
            typ = struct.unpack('B', self.path_or_buf.read(1))[0]
            length = struct.unpack(self.byteorder + 'I',
                                   self.path_or_buf.read(4))[0]
            va = self.path_or_buf.read(length)
            if typ == 130:
                # type 130 is null-terminated text; 129 stays raw binary
                va = va[0:-1].decode(self._encoding)
            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
            self.GSO[str(v_o)] = va
    # legacy
    @Appender(_data_method_doc)
    def data(self, **kwargs):
        # Deprecated alias for read(); reads the whole file in one call
        # and refuses to run twice.
        warnings.warn("'data' is deprecated, use 'read' instead")
        if self._data_read:
            raise Exception("Data has already been read.")
        self._data_read = True
        return self.read(None, **kwargs)
  1220. def __next__(self):
  1221. return self.read(nrows=self._chunksize or 1)
  1222. def get_chunk(self, size=None):
  1223. """
  1224. Reads lines from Stata file and returns as dataframe
  1225. Parameters
  1226. ----------
  1227. size : int, defaults to None
  1228. Number of lines to read. If None, reads whole file.
  1229. Returns
  1230. -------
  1231. DataFrame
  1232. """
  1233. if size is None:
  1234. size = self._chunksize
  1235. return self.read(nrows=size)
    @Appender(_read_method_doc)
    @deprecate_kwarg(old_arg_name='index', new_arg_name='index_col')
    def read(self, nrows=None, convert_dates=None,
             convert_categoricals=None, index_col=None,
             convert_missing=None, preserve_dtypes=None,
             columns=None, order_categoricals=None):
        # Handle empty file or chunk. If reading incrementally raise
        # StopIteration. If reading the whole thing return an empty
        # data frame.
        if (self.nobs == 0) and (nrows is None):
            self._can_read_value_labels = True
            self._data_read = True
            self.close()
            return DataFrame(columns=self.varlist)

        # Handle options: fall back to the values given at construction
        # when a per-call override was not supplied.
        if convert_dates is None:
            convert_dates = self._convert_dates
        if convert_categoricals is None:
            convert_categoricals = self._convert_categoricals
        if convert_missing is None:
            convert_missing = self._convert_missing
        if preserve_dtypes is None:
            preserve_dtypes = self._preserve_dtypes
        if columns is None:
            columns = self._columns
        if order_categoricals is None:
            order_categoricals = self._order_categoricals
        if index_col is None:
            index_col = self._index_col
        if nrows is None:
            nrows = self.nobs

        # New-format files keep strLs in a separate table; load it once.
        if (self.format_version >= 117) and (not self._value_labels_read):
            self._can_read_value_labels = True
            self._read_strls()

        # Read data
        dtype = self._dtype
        max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
        read_len = nrows * dtype.itemsize
        read_len = min(read_len, max_read_len)
        if read_len <= 0:
            # Iterator has finished, should never be here unless
            # we are reading the file incrementally
            if convert_categoricals:
                self._read_value_labels()
            self.close()
            raise StopIteration
        offset = self._lines_read * dtype.itemsize
        self.path_or_buf.seek(self.data_location + offset)
        read_lines = min(nrows, self.nobs - self._lines_read)
        data = np.frombuffer(self.path_or_buf.read(read_len), dtype=dtype,
                             count=read_lines)

        self._lines_read += read_lines
        if self._lines_read == self.nobs:
            self._can_read_value_labels = True
            self._data_read = True

        # if necessary, swap the byte order to native here
        if self.byteorder != self._native_byteorder:
            data = data.byteswap().newbyteorder()

        if convert_categoricals:
            self._read_value_labels()

        if len(data) == 0:
            data = DataFrame(columns=self.varlist)
        else:
            data = DataFrame.from_records(data)
            data.columns = self.varlist

        # If index is not specified, use actual row number rather than
        # restarting at 0 for each chunk.
        if index_col is None:
            ix = np.arange(self._lines_read - read_lines, self._lines_read)
            data = data.set_index(ix)

        if columns is not None:
            try:
                data = self._do_select_columns(data, columns)
            except ValueError:
                self.close()
                raise

        # Decode strings (integer typlist entries mark string columns)
        for col, typ in zip(data, self.typlist):
            if type(typ) is int:
                data[col] = data[col].apply(
                    self._null_terminate, convert_dtype=True)

        data = self._insert_strls(data)

        # Indices of columns with a known target dtype
        cols_ = np.where(self.dtyplist)[0]

        # Convert columns (if needed) to match input type
        ix = data.index
        requires_type_conversion = False
        data_formatted = []
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                dtype = data[col].dtype
                if dtype != np.dtype(object) and dtype != self.dtyplist[i]:
                    requires_type_conversion = True
                    data_formatted.append(
                        (col, Series(data[col], ix, self.dtyplist[i])))
                else:
                    data_formatted.append((col, data[col]))
        if requires_type_conversion:
            data = DataFrame.from_dict(OrderedDict(data_formatted))
        del data_formatted

        self._do_convert_missing(data, convert_missing)

        if convert_dates:
            # Columns whose display format starts with a date format code
            cols = np.where(lmap(lambda x: any(x.startswith(fmt)
                                               for fmt in _date_formats),
                                 self.fmtlist))[0]
            for i in cols:
                col = data.columns[i]
                try:
                    data[col] = _stata_elapsed_date_to_datetime_vec(
                        data[col],
                        self.fmtlist[i])
                except ValueError:
                    self.close()
                    raise

        if convert_categoricals and self.format_version > 108:
            data = self._do_convert_categoricals(data,
                                                 self.value_label_dict,
                                                 self.lbllist,
                                                 order_categoricals)

        if not preserve_dtypes:
            # Upcast small ints to int64 and narrow floats to float64
            retyped_data = []
            convert = False
            for col in data:
                dtype = data[col].dtype
                if dtype in (np.float16, np.float32):
                    dtype = np.float64
                    convert = True
                elif dtype in (np.int8, np.int16, np.int32):
                    dtype = np.int64
                    convert = True
                retyped_data.append((col, data[col].astype(dtype)))
            if convert:
                data = DataFrame.from_dict(OrderedDict(retyped_data))

        if index_col is not None:
            data = data.set_index(data.pop(index_col))

        return data
def _do_convert_missing(self, data, convert_missing):
    """
    Replace out-of-range (Stata missing) values in each column, in place.

    Parameters
    ----------
    data : DataFrame
        Decoded data; columns are scanned positionally against self.typlist.
    convert_missing : bool
        If True, replace each missing entry with a StataMissingValue object
        preserving Stata's distinct missing codes; if False, replace all
        missing entries with NaN (upcasting to float64 when needed).
    """
    # Check for missing values, and replace if found
    for i, colname in enumerate(data):
        fmt = self.typlist[i]
        # Only numeric Stata types have a defined valid range; skip strings.
        if fmt not in self.VALID_RANGE:
            continue
        nmin, nmax = self.VALID_RANGE[fmt]
        series = data[colname]
        # Values outside [nmin, nmax] encode Stata missing values.
        missing = np.logical_or(series < nmin, series > nmax)
        if not missing.any():
            continue
        if convert_missing:  # Replacement follows Stata notation
            # NOTE(review): ``_ndarray_values`` is a pandas-internal
            # accessor; yields the raw ndarray backing the boolean mask.
            missing_loc = np.argwhere(missing._ndarray_values)
            # Group identical raw codes so each distinct Stata missing
            # value ('.', '.a', ...) maps to one StataMissingValue object.
            umissing, umissing_loc = np.unique(series[missing],
                                               return_inverse=True)
            replacement = Series(series, dtype=np.object)
            for j, um in enumerate(umissing):
                missing_value = StataMissingValue(um)
                loc = missing_loc[umissing_loc == j]
                replacement.iloc[loc] = missing_value
        else:  # All replacements are identical
            dtype = series.dtype
            # NaN requires a float dtype; upcast integer columns.
            if dtype not in (np.float32, np.float64):
                dtype = np.float64
            replacement = Series(series, dtype=dtype)
            replacement[missing] = np.nan
        data[colname] = replacement
  1399. def _insert_strls(self, data):
  1400. if not hasattr(self, 'GSO') or len(self.GSO) == 0:
  1401. return data
  1402. for i, typ in enumerate(self.typlist):
  1403. if typ != 'Q':
  1404. continue
  1405. # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
  1406. data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]]
  1407. return data
  1408. def _do_select_columns(self, data, columns):
  1409. if not self._column_selector_set:
  1410. column_set = set(columns)
  1411. if len(column_set) != len(columns):
  1412. raise ValueError('columns contains duplicate entries')
  1413. unmatched = column_set.difference(data.columns)
  1414. if unmatched:
  1415. raise ValueError('The following columns were not found in the '
  1416. 'Stata data set: ' +
  1417. ', '.join(list(unmatched)))
  1418. # Copy information for retained columns for later processing
  1419. dtyplist = []
  1420. typlist = []
  1421. fmtlist = []
  1422. lbllist = []
  1423. for col in columns:
  1424. i = data.columns.get_loc(col)
  1425. dtyplist.append(self.dtyplist[i])
  1426. typlist.append(self.typlist[i])
  1427. fmtlist.append(self.fmtlist[i])
  1428. lbllist.append(self.lbllist[i])
  1429. self.dtyplist = dtyplist
  1430. self.typlist = typlist
  1431. self.fmtlist = fmtlist
  1432. self.lbllist = lbllist
  1433. self._column_selector_set = True
  1434. return data[columns]
def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                             order_categoricals):
    """
    Converts categorical columns to Categorical type.

    Columns whose label name (from *lbllist*) appears in *value_label_dict*
    are converted to pandas Categorical, with the raw codes replaced by
    their Stata value labels; unlabeled categories are kept as-is
    (a partially labeled column). Other columns pass through unchanged.

    Raises
    ------
    ValueError
        If the mapped labels for a column are not unique (two codes share
        the same label), which Categorical cannot represent.
    """
    value_labels = list(compat.iterkeys(value_label_dict))
    cat_converted_data = []
    for col, label in zip(data, lbllist):
        if label in value_labels:
            # Explicit call with ordered=True
            cat_data = Categorical(data[col], ordered=order_categoricals)
            categories = []
            for category in cat_data.categories:
                if category in value_label_dict[label]:
                    categories.append(value_label_dict[label][category])
                else:
                    categories.append(category)  # Partially labeled
            try:
                cat_data.categories = categories
            except ValueError:
                # Duplicate labels: report which labels repeat.
                vc = Series(categories).value_counts()
                repeats = list(vc.index[vc > 1])
                repeats = '\n' + '-' * 80 + '\n'.join(repeats)
                raise ValueError('Value labels for column {col} are not '
                                 'unique. The repeated labels are:\n'
                                 '{repeats}'
                                 .format(col=col, repeats=repeats))
            # TODO: is the next line needed above in the data(...) method?
            cat_data = Series(cat_data, index=data.index)
            cat_converted_data.append((col, cat_data))
        else:
            cat_converted_data.append((col, data[col]))
    # Rebuild preserving the original column order.
    data = DataFrame.from_dict(OrderedDict(cat_converted_data))
    return data
def data_label(self):
    """Returns data label of Stata file"""
    # NOTE(review): the instance attribute ``self.data_label`` (set while
    # parsing the header) shadows this method on instances, so this body
    # is effectively unreachable via normal attribute access — confirm
    # before relying on calling it as a method.
    return self.data_label
  1472. def variable_labels(self):
  1473. """Returns variable labels as a dict, associating each variable name
  1474. with corresponding label
  1475. """
  1476. return dict(zip(self.varlist, self._variable_labels))
  1477. def value_labels(self):
  1478. """Returns a dict, associating each variable name a dict, associating
  1479. each value its corresponding label
  1480. """
  1481. if not self._value_labels_read:
  1482. self._read_value_labels()
  1483. return self.value_label_dict
  1484. def _open_file_binary_write(fname):
  1485. """
  1486. Open a binary file or no-op if file-like
  1487. Parameters
  1488. ----------
  1489. fname : string path, path object or buffer
  1490. Returns
  1491. -------
  1492. file : file-like object
  1493. File object supporting write
  1494. own : bool
  1495. True if the file was created, otherwise False
  1496. """
  1497. if hasattr(fname, 'write'):
  1498. # if 'b' not in fname.mode:
  1499. return fname, False
  1500. return open(fname, "wb"), True
  1501. def _set_endianness(endianness):
  1502. if endianness.lower() in ["<", "little"]:
  1503. return "<"
  1504. elif endianness.lower() in [">", "big"]:
  1505. return ">"
  1506. else: # pragma : no cover
  1507. raise ValueError(
  1508. "Endianness {endian} not understood".format(endian=endianness))
  1509. def _pad_bytes(name, length):
  1510. """
  1511. Takes a char string and pads it with null bytes until it's length chars
  1512. """
  1513. return name + "\x00" * (length - len(name))
  1514. def _convert_datetime_to_stata_type(fmt):
  1515. """
  1516. Converts from one of the stata date formats to a type in TYPE_MAP
  1517. """
  1518. if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq",
  1519. "%tq", "th", "%th", "ty", "%ty"]:
  1520. return np.float64 # Stata expects doubles for SIFs
  1521. else:
  1522. raise NotImplementedError(
  1523. "Format {fmt} not implemented".format(fmt=fmt))
  1524. def _maybe_convert_to_int_keys(convert_dates, varlist):
  1525. new_dict = {}
  1526. for key in convert_dates:
  1527. if not convert_dates[key].startswith("%"): # make sure proper fmts
  1528. convert_dates[key] = "%" + convert_dates[key]
  1529. if key in varlist:
  1530. new_dict.update({varlist.index(key): convert_dates[key]})
  1531. else:
  1532. if not isinstance(key, int):
  1533. raise ValueError("convert_dates key must be a "
  1534. "column or an integer")
  1535. new_dict.update({key: convert_dates[key]})
  1536. return new_dict
  1537. def _dtype_to_stata_type(dtype, column):
  1538. """
  1539. Converts dtype types to stata types. Returns the byte of the given ordinal.
  1540. See TYPE_MAP and comments for an explanation. This is also explained in
  1541. the dta spec.
  1542. 1 - 244 are strings of this length
  1543. Pandas Stata
  1544. 251 - for int8 byte
  1545. 252 - for int16 int
  1546. 253 - for int32 long
  1547. 254 - for float32 float
  1548. 255 - for double double
  1549. If there are dates to convert, then dtype will already have the correct
  1550. type inserted.
  1551. """
  1552. # TODO: expand to handle datetime to integer conversion
  1553. if dtype.type == np.object_: # try to coerce it to the biggest string
  1554. # not memory efficient, what else could we
  1555. # do?
  1556. itemsize = max_len_string_array(ensure_object(column.values))
  1557. return max(itemsize, 1)
  1558. elif dtype == np.float64:
  1559. return 255
  1560. elif dtype == np.float32:
  1561. return 254
  1562. elif dtype == np.int32:
  1563. return 253
  1564. elif dtype == np.int16:
  1565. return 252
  1566. elif dtype == np.int8:
  1567. return 251
  1568. else: # pragma : no cover
  1569. raise NotImplementedError(
  1570. "Data type {dtype} not supported.".format(dtype=dtype))
  1571. def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
  1572. force_strl=False):
  1573. """
  1574. Maps numpy dtype to stata's default format for this type. Not terribly
  1575. important since users can change this in Stata. Semantics are
  1576. object -> "%DDs" where DD is the length of the string. If not a string,
  1577. raise ValueError
  1578. float64 -> "%10.0g"
  1579. float32 -> "%9.0g"
  1580. int64 -> "%9.0g"
  1581. int32 -> "%12.0g"
  1582. int16 -> "%8.0g"
  1583. int8 -> "%8.0g"
  1584. strl -> "%9s"
  1585. """
  1586. # TODO: Refactor to combine type with format
  1587. # TODO: expand this to handle a default datetime format?
  1588. if dta_version < 117:
  1589. max_str_len = 244
  1590. else:
  1591. max_str_len = 2045
  1592. if force_strl:
  1593. return '%9s'
  1594. if dtype.type == np.object_:
  1595. inferred_dtype = infer_dtype(column, skipna=True)
  1596. if not (inferred_dtype in ('string', 'unicode') or
  1597. len(column) == 0):
  1598. raise ValueError('Column `{col}` cannot be exported.\n\nOnly '
  1599. 'string-like object arrays containing all '
  1600. 'strings or a mix of strings and None can be '
  1601. 'exported. Object arrays containing only null '
  1602. 'values are prohibited. Other object types'
  1603. 'cannot be exported and must first be converted '
  1604. 'to one of the supported '
  1605. 'types.'.format(col=column.name))
  1606. itemsize = max_len_string_array(ensure_object(column.values))
  1607. if itemsize > max_str_len:
  1608. if dta_version >= 117:
  1609. return '%9s'
  1610. else:
  1611. raise ValueError(excessive_string_length_error % column.name)
  1612. return "%" + str(max(itemsize, 1)) + "s"
  1613. elif dtype == np.float64:
  1614. return "%10.0g"
  1615. elif dtype == np.float32:
  1616. return "%9.0g"
  1617. elif dtype == np.int32:
  1618. return "%12.0g"
  1619. elif dtype == np.int8 or dtype == np.int16:
  1620. return "%8.0g"
  1621. else: # pragma : no cover
  1622. raise NotImplementedError(
  1623. "Data type {dtype} not supported.".format(dtype=dtype))
class StataWriter(StataParser):
    """
    A class for writing Stata binary dta files

    Parameters
    ----------
    fname : path (string), buffer or path object
        string, path object (pathlib.Path or py._path.local.LocalPath) or
        object implementing a binary write() functions. If using a buffer
        then the buffer will not be automatically closed after the file
        is written.

        .. versionadded:: 0.23.0 support for pathlib, py.path.

    data : DataFrame
        Input to save
    convert_dates : dict
        Dictionary mapping columns containing datetime types to stata internal
        format to use when writing the dates. Options are 'tc', 'td', 'tm',
        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
        Datetime columns that do not have a conversion type specified will be
        converted to 'tc'. Raises NotImplementedError if a datetime column has
        timezone information
    write_index : bool
        Write the index to Stata dataset.
    encoding : str
        Default is latin-1. Only latin-1 and ascii are supported.
    byteorder : str
        Can be ">", "<", "little", or "big". default is `sys.byteorder`
    time_stamp : datetime
        A datetime to use as file creation date. Default is the current time
    data_label : str
        A label for the data set. Must be 80 characters or smaller.
    variable_labels : dict
        Dictionary containing columns as keys and variable labels as values.
        Each label must be 80 characters or smaller.

        .. versionadded:: 0.19.0

    Returns
    -------
    writer : StataWriter instance
        The StataWriter instance has a write_file method, which will
        write the file to the given `fname`.

    Raises
    ------
    NotImplementedError
        * If datetimes contain timezone information
    ValueError
        * Columns listed in convert_dates are neither datetime64[ns]
          or datetime.datetime
        * Column dtype is not representable in Stata
        * Column listed in convert_dates is not in DataFrame
        * Categorical label contains more than 32,000 characters

    Examples
    --------
    >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b'])
    >>> writer = StataWriter('./data_file.dta', data)
    >>> writer.write_file()

    Or with dates
    >>> from datetime import datetime
    >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date'])
    >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'})
    >>> writer.write_file()
    """

    # dta 114 fixed-width string limit; subclasses for newer formats override.
    _max_string_length = 244

    @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
    def __init__(self, fname, data, convert_dates=None, write_index=True,
                 encoding="latin-1", byteorder=None, time_stamp=None,
                 data_label=None, variable_labels=None):
        super(StataWriter, self).__init__()
        self._convert_dates = {} if convert_dates is None else convert_dates
        self._write_index = write_index
        # The ``encoding`` argument is deprecated; latin-1 is always used.
        self._encoding = 'latin-1'
        self._time_stamp = time_stamp
        self._data_label = data_label
        self._variable_labels = variable_labels
        self._own_file = True
        # attach nobs, nvars, data, varlist, typlist
        self._prepare_pandas(data)

        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        self._fname = _stringify_path(fname)
        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
        self._converted_names = {}

    def _write(self, to_write):
        """
        Helper to call encode before writing to file for Python 3 compat.
        """
        if compat.PY3:
            self._file.write(to_write.encode(self._encoding or
                                             self._default_encoding))
        else:
            self._file.write(to_write)

    def _prepare_categoricals(self, data):
        """Check for categorical columns, retain categorical information for
        Stata file and convert categorical data to int"""
        is_cat = [is_categorical_dtype(data[col]) for col in data]
        self._is_col_cat = is_cat
        self._value_labels = []
        if not any(is_cat):
            return data

        get_base_missing_value = StataMissingValue.get_base_missing_value
        data_formatted = []
        for col, col_is_cat in zip(data, is_cat):
            if col_is_cat:
                self._value_labels.append(StataValueLabel(data[col]))
                dtype = data[col].cat.codes.dtype
                if dtype == np.int64:
                    raise ValueError('It is not possible to export '
                                     'int64-based categorical data to Stata.')
                values = data[col].cat.codes.values.copy()

                # Upcast if needed so that correct missing values can be set
                if values.max() >= get_base_missing_value(dtype):
                    if dtype == np.int8:
                        dtype = np.int16
                    elif dtype == np.int16:
                        dtype = np.int32
                    else:
                        dtype = np.float64
                    values = np.array(values, dtype=dtype)

                # Replace missing values with Stata missing value for type
                values[values == -1] = get_base_missing_value(dtype)
                data_formatted.append((col, values))
            else:
                data_formatted.append((col, data[col]))
        return DataFrame.from_dict(OrderedDict(data_formatted))

    def _replace_nans(self, data):
        """Checks floating point data columns for nans, and replaces these with
        the generic Stata for missing value (.)"""
        for c in data:
            dtype = data[c].dtype
            if dtype in (np.float32, np.float64):
                if dtype == np.float32:
                    replacement = self.MISSING_VALUES['f']
                else:
                    replacement = self.MISSING_VALUES['d']
                data[c] = data[c].fillna(replacement)
        return data

    def _update_strl_names(self):
        """No-op, forward compatibility"""
        pass

    def _check_column_names(self, data):
        """
        Checks column names to ensure that they are valid Stata column names.
        This includes checks for:
            * Non-string names
            * Stata keywords
            * Variables that start with numbers
            * Variables with names that are too long

        When an illegal variable name is detected, it is converted, and if
        dates are exported, the variable name is propagated to the date
        conversion dictionary
        """
        converted_names = {}
        columns = list(data.columns)
        original_columns = columns[:]

        duplicate_var_id = 0
        for j, name in enumerate(columns):
            orig_name = name
            if not isinstance(name, string_types):
                name = text_type(name)

            # Replace any character outside [A-Za-z0-9_] with underscore.
            for c in name:
                if ((c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and
                        (c < '0' or c > '9') and c != '_'):
                    name = name.replace(c, '_')

            # Variable name must not be a reserved word
            if name in self.RESERVED_WORDS:
                name = '_' + name

            # Variable name may not start with a number
            if name[0] >= '0' and name[0] <= '9':
                name = '_' + name

            # Stata limits variable names to 32 characters.
            name = name[:min(len(name), 32)]

            if not name == orig_name:
                # check for duplicates
                while columns.count(name) > 0:
                    # prepend ascending number to avoid duplicates
                    name = '_' + str(duplicate_var_id) + name
                    name = name[:min(len(name), 32)]
                    duplicate_var_id += 1
                converted_names[orig_name] = name

            columns[j] = name

        data.columns = columns

        # Check date conversion, and fix key if needed
        if self._convert_dates:
            for c, o in zip(columns, original_columns):
                if c != o:
                    self._convert_dates[c] = self._convert_dates[o]
                    del self._convert_dates[o]

        if converted_names:
            conversion_warning = []
            for orig_name, name in converted_names.items():
                # need to possibly encode the orig name if its unicode
                try:
                    orig_name = orig_name.encode('utf-8')
                except (UnicodeDecodeError, AttributeError):
                    pass
                msg = '{0} -> {1}'.format(orig_name, name)
                conversion_warning.append(msg)

            ws = invalid_name_doc.format('\n '.join(conversion_warning))
            warnings.warn(ws, InvalidColumnName)

        self._converted_names = converted_names
        self._update_strl_names()

        return data

    def _set_formats_and_types(self, data, dtypes):
        # Derive per-column Stata display format and storage type codes.
        self.typlist = []
        self.fmtlist = []
        for col, dtype in dtypes.iteritems():
            self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col]))
            self.typlist.append(_dtype_to_stata_type(dtype, data[col]))

    def _prepare_pandas(self, data):
        # NOTE: we might need a different API / class for pandas objects so
        # we can set different semantics - handle this with a PR to pandas.io

        data = data.copy()

        if self._write_index:
            data = data.reset_index()

        # Ensure column names are strings
        data = self._check_column_names(data)

        # Check columns for compatibility with stata, upcast if necessary
        # Raise if outside the supported range
        data = _cast_to_stata_types(data)

        # Replace NaNs with Stata missing values
        data = self._replace_nans(data)

        # Convert categoricals to int data, and strip labels
        data = self._prepare_categoricals(data)

        self.nobs, self.nvar = data.shape
        self.data = data
        self.varlist = data.columns.tolist()

        dtypes = data.dtypes

        # Ensure all date columns are converted
        for col in data:
            if col in self._convert_dates:
                continue
            if is_datetime64_dtype(data[col]):
                self._convert_dates[col] = 'tc'

        self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates,
                                                         self.varlist)
        for key in self._convert_dates:
            new_type = _convert_datetime_to_stata_type(
                self._convert_dates[key]
            )
            dtypes[key] = np.dtype(new_type)

        self._set_formats_and_types(data, dtypes)

        # set the given format for the datetime cols
        if self._convert_dates is not None:
            for key in self._convert_dates:
                self.fmtlist[key] = self._convert_dates[key]

    def write_file(self):
        # Write all sections in dta order; on any failure, close and try to
        # remove the partial file so no invalid dta is left behind.
        self._file, self._own_file = _open_file_binary_write(self._fname)
        try:
            self._write_header(time_stamp=self._time_stamp,
                               data_label=self._data_label)
            self._write_map()
            self._write_variable_types()
            self._write_varnames()
            self._write_sortlist()
            self._write_formats()
            self._write_value_label_names()
            self._write_variable_labels()
            self._write_expansion_fields()
            self._write_characteristics()
            self._prepare_data()
            self._write_data()
            self._write_strls()
            self._write_value_labels()
            self._write_file_close_tag()
            self._write_map()
        except Exception as exc:
            self._close()
            try:
                if self._own_file:
                    os.unlink(self._fname)
            except Exception:
                warnings.warn('This save was not successful but {0} could not '
                              'be deleted. This file is not '
                              'valid.'.format(self._fname), ResourceWarning)
            raise exc
        else:
            self._close()

    def _close(self):
        """
        Close the file if it was created by the writer.

        If a buffer or file-like object was passed in, for example a GzipFile,
        then leave this file open for the caller to close. In either case,
        attempt to flush the file contents to ensure they are written to disk
        (if supported)
        """
        # Some file-like objects might not support flush
        try:
            self._file.flush()
        except AttributeError:
            pass
        if self._own_file:
            self._file.close()

    def _write_map(self):
        """No-op, future compatibility"""
        pass

    def _write_file_close_tag(self):
        """No-op, future compatibility"""
        pass

    def _write_characteristics(self):
        """No-op, future compatibility"""
        pass

    def _write_strls(self):
        """No-op, future compatibility"""
        pass

    def _write_expansion_fields(self):
        """Write 5 zeros for expansion fields"""
        self._write(_pad_bytes("", 5))

    def _write_value_labels(self):
        # One serialized label table per categorical column.
        for vl in self._value_labels:
            self._file.write(vl.generate_value_label(self._byteorder,
                                                     self._encoding))

    def _write_header(self, data_label=None, time_stamp=None):
        byteorder = self._byteorder
        # ds_format - just use 114
        self._file.write(struct.pack("b", 114))
        # byteorder
        self._write(byteorder == ">" and "\x01" or "\x02")
        # filetype
        self._write("\x01")
        # unused
        self._write("\x00")
        # number of vars, 2 bytes
        self._file.write(struct.pack(byteorder + "h", self.nvar)[:2])
        # number of obs, 4 bytes
        self._file.write(struct.pack(byteorder + "i", self.nobs)[:4])
        # data label 81 bytes, char, null terminated
        if data_label is None:
            self._file.write(self._null_terminate(_pad_bytes("", 80)))
        else:
            self._file.write(
                self._null_terminate(_pad_bytes(data_label[:80], 80))
            )
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be datetime type")
        # GH #13856
        # Avoid locale-specific month conversion
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                  'Sep', 'Oct', 'Nov', 'Dec']
        month_lookup = {i + 1: month for i, month in enumerate(months)}
        ts = (time_stamp.strftime("%d ") +
              month_lookup[time_stamp.month] +
              time_stamp.strftime(" %Y %H:%M"))
        self._file.write(self._null_terminate(ts))

    def _write_variable_types(self):
        # One type byte per variable, in column order.
        for typ in self.typlist:
            self._file.write(struct.pack('B', typ))

    def _write_varnames(self):
        # varlist names are checked by _check_column_names
        # varlist, requires null terminated
        for name in self.varlist:
            name = self._null_terminate(name, True)
            name = _pad_bytes(name[:32], 33)
            self._write(name)

    def _write_sortlist(self):
        # srtlist, 2*(nvar+1), int array, encoded by byteorder
        srtlist = _pad_bytes("", 2 * (self.nvar + 1))
        self._write(srtlist)

    def _write_formats(self):
        # fmtlist, 49*nvar, char array
        for fmt in self.fmtlist:
            self._write(_pad_bytes(fmt, 49))

    def _write_value_label_names(self):
        # lbllist, 33*nvar, char array
        for i in range(self.nvar):
            # Use variable name when categorical
            if self._is_col_cat[i]:
                name = self.varlist[i]
                name = self._null_terminate(name, True)
                name = _pad_bytes(name[:32], 33)
                self._write(name)
            else:  # Default is empty label
                self._write(_pad_bytes("", 33))

    def _write_variable_labels(self):
        # Missing labels are 80 blank characters plus null termination
        blank = _pad_bytes('', 81)

        if self._variable_labels is None:
            for i in range(self.nvar):
                self._write(blank)
            return

        for col in self.data:
            if col in self._variable_labels:
                label = self._variable_labels[col]
                if len(label) > 80:
                    raise ValueError('Variable labels must be 80 characters '
                                     'or fewer')
                is_latin1 = all(ord(c) < 256 for c in label)
                if not is_latin1:
                    raise ValueError('Variable labels must contain only '
                                     'characters that can be encoded in '
                                     'Latin-1')
                self._write(_pad_bytes(label, 81))
            else:
                self._write(blank)

    def _convert_strls(self, data):
        """No-op, future compatibility"""
        return data

    def _prepare_data(self):
        data = self.data
        typlist = self.typlist
        convert_dates = self._convert_dates
        # 1. Convert dates
        if self._convert_dates is not None:
            for i, col in enumerate(data):
                if i in convert_dates:
                    data[col] = _datetime_to_stata_elapsed_vec(data[col],
                                                               self.fmtlist[i])
        # 2. Convert strls
        data = self._convert_strls(data)

        # 3. Convert bad string data to '' and pad to correct length
        dtypes = []
        data_cols = []
        has_strings = False
        native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
        for i, col in enumerate(data):
            typ = typlist[i]
            if typ <= self._max_string_length:
                has_strings = True
                data[col] = data[col].fillna('').apply(_pad_bytes, args=(typ,))
                stype = 'S{type}'.format(type=typ)
                dtypes.append(('c' + str(i), stype))
                string = data[col].str.encode(self._encoding)
                data_cols.append(string.values.astype(stype))
            else:
                values = data[col].values
                dtype = data[col].dtype
                if not native_byteorder:
                    dtype = dtype.newbyteorder(self._byteorder)
                dtypes.append(('c' + str(i), dtype))
                data_cols.append(values)
        dtypes = np.dtype(dtypes)

        # Fast path: a homogeneous, native-order frame can be dumped via
        # to_records; otherwise build a structured array row by row.
        if has_strings or not native_byteorder:
            self.data = np.fromiter(zip(*data_cols), dtype=dtypes)
        else:
            self.data = data.to_records(index=False)

    def _write_data(self):
        data = self.data
        self._file.write(data.tobytes())

    def _null_terminate(self, s, as_string=False):
        null_byte = '\x00'
        if compat.PY3 and not as_string:
            s += null_byte
            return s.encode(self._encoding)
        else:
            s += null_byte
            return s
def _dtype_to_stata_type_117(dtype, column, force_strl):
    """
    Converts dtype types to stata types. Returns the byte of the given ordinal.
    See TYPE_MAP and comments for an explanation. This is also explained in
    the dta spec.
    1 - 2045 are strings of this length
                Pandas    Stata
    32768 - for object    strL
    65526 - for float64   double
    65527 - for float32   float
    65528 - for int32     long
    65529 - for int16     int
    65530 - for int8      byte

    If there are dates to convert, then dtype will already have the correct
    type inserted.
    """
    # NOTE: the docstring table above was corrected to match the code (and
    # the dta-117 spec); it previously listed the numeric mapping inverted.
    # TODO: expand to handle datetime to integer conversion
    if force_strl:
        return 32768
    if dtype.type == np.object_:  # try to coerce it to the biggest string
        # not memory efficient, what else could we
        # do?
        itemsize = max_len_string_array(ensure_object(column.values))
        itemsize = max(itemsize, 1)
        if itemsize <= 2045:
            return itemsize
        # Strings longer than 2045 characters must be stored as strL.
        return 32768
    elif dtype == np.float64:
        return 65526
    elif dtype == np.float32:
        return 65527
    elif dtype == np.int32:
        return 65528
    elif dtype == np.int16:
        return 65529
    elif dtype == np.int8:
        return 65530
    else:  # pragma : no cover
        raise NotImplementedError("Data type %s not supported." % dtype)
  2111. def _bytes(s, encoding):
  2112. if compat.PY3:
  2113. return bytes(s, encoding)
  2114. else:
  2115. return bytes(s.encode(encoding))
  2116. def _pad_bytes_new(name, length):
  2117. """
  2118. Takes a bytes instance and pads it with null bytes until it's length chars.
  2119. """
  2120. if isinstance(name, string_types):
  2121. name = _bytes(name, 'utf-8')
  2122. return name + b'\x00' * (length - len(name))
class StataStrLWriter(object):
    """
    Converter for Stata StrLs

    Stata StrLs map 8 byte values to strings which are stored using a
    dictionary-like format where strings are keyed to two values.

    Parameters
    ----------
    df : DataFrame
        DataFrame to convert
    columns : list
        List of columns names to convert to StrL
    version : int, optional
        dta version. Currently supports 117, 118 and 119
    byteorder : str, optional
        Can be ">", "<", "little", or "big". default is `sys.byteorder`

    Notes
    -----
    Supports creation of the StrL block of a dta file for dta versions
    117, 118 and 119. These differ in how the GSO is stored. 118 and
    119 store the GSO lookup value as a uint32 and a uint64, while 117
    uses two uint32s. 118 and 119 also encode all strings as unicode
    which is required by the format. 117 uses 'latin-1' a fixed width
    encoding that extends the 7-bit ascii table with an additional 128
    characters.
    """

    def __init__(self, df, columns, version=117, byteorder=None):
        if version not in (117, 118, 119):
            raise ValueError('Only dta versions 117, 118 and 119 supported')
        self._dta_ver = version
        self.df = df
        self.columns = columns
        # Seed the table so the empty string always maps to (0, 0)
        self._gso_table = OrderedDict((('', (0, 0)),))
        if byteorder is None:
            byteorder = sys.byteorder
        self._byteorder = _set_endianness(byteorder)
        gso_v_type = 'I'  # uint32
        gso_o_type = 'Q'  # uint64
        self._encoding = 'utf-8'
        if version == 117:
            o_size = 4
            gso_o_type = 'I'  # 117 used uint32
            self._encoding = 'latin-1'
        elif version == 118:
            o_size = 6
        else:  # version == 119
            o_size = 5
        # Multiplier that shifts o into the bits above v when packing a
        # (v, o) pair into one integer.  NOTE(review): the attribute name
        # carries a historical typo ("offet") and is kept for consistency.
        self._o_offet = 2 ** (8 * (8 - o_size))
        self._gso_o_type = gso_o_type
        self._gso_v_type = gso_v_type

    def _convert_key(self, key):
        """Pack a (v, o) key into one integer, o occupying the upper bits."""
        v, o = key
        return v + self._o_offet * o

    def generate_table(self):
        """
        Generates the GSO lookup table for the DataFrame

        Returns
        -------
        gso_table : OrderedDict
            Ordered dictionary using the string found as keys
            and their lookup position (v, o) as values
        gso_df : DataFrame
            DataFrame where strl columns have been converted to
            (v, o) values

        Notes
        -----
        Modifies the DataFrame in-place.

        The DataFrame returned encodes the (v, o) values as uint64s. The
        encoding depends on the dta version, and can be expressed as

            enc = v + o * 2 ** (o_size * 8)

        so that v is stored in the lower bits and o is in the upper
        bits. o_size is

        * 117: 4
        * 118: 6
        * 119: 5
        """
        gso_table = self._gso_table
        gso_df = self.df
        columns = list(gso_df.columns)
        selected = gso_df[self.columns]
        col_index = [(col, columns.index(col)) for col in self.columns]
        keys = np.empty(selected.shape, dtype=np.uint64)
        for o, (idx, row) in enumerate(selected.iterrows()):
            for j, (col, v) in enumerate(col_index):
                val = row[col]
                # Allow columns with mixed str and None (GH 23633)
                val = '' if val is None else val
                key = gso_table.get(val, None)
                if key is None:
                    # Stata prefers human numbers (1-based v and o)
                    key = (v + 1, o + 1)
                    gso_table[val] = key
                keys[o, j] = self._convert_key(key)
        # Replace each strl column with its packed uint64 lookup keys
        for i, col in enumerate(self.columns):
            gso_df[col] = keys[:, i]
        return gso_table, gso_df

    def _encode(self, s):
        """
        Python 3 compatibility shim
        """
        if compat.PY3:
            return s.encode(self._encoding)
        else:
            if isinstance(s, text_type):
                return s.encode(self._encoding)
            return s

    def generate_blob(self, gso_table):
        """
        Generates the binary blob of GSOs that is written to the dta file.

        Parameters
        ----------
        gso_table : OrderedDict
            Ordered dictionary (str, vo)

        Returns
        -------
        gso : bytes
            Binary content of dta file to be placed between strl tags

        Notes
        -----
        Output format depends on dta version. 117 uses two uint32s to
        express v and o while 118+ uses a uint32 for v and a uint64 for o.
        """
        # Format information
        # Length includes null term
        # 117
        # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
        #  3  u4  u4 u1 u4  string + null term
        #
        # 118, 119
        # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
        #  3  u4   u8   u1 u4  string + null term
        bio = BytesIO()
        gso = _bytes('GSO', 'ascii')
        # type byte 130: string payload, written with a null terminator below
        gso_type = struct.pack(self._byteorder + 'B', 130)
        null = struct.pack(self._byteorder + 'B', 0)
        v_type = self._byteorder + self._gso_v_type
        o_type = self._byteorder + self._gso_o_type
        len_type = self._byteorder + 'I'
        for strl, vo in gso_table.items():
            # (0, 0) is the sentinel entry for '' and is never written out
            if vo == (0, 0):
                continue
            v, o = vo
            # GSO
            bio.write(gso)
            # vvvv
            bio.write(struct.pack(v_type, v))
            # oooo / oooooooo
            bio.write(struct.pack(o_type, o))
            # t
            bio.write(gso_type)
            # llll  (length includes the null terminator)
            utf8_string = _bytes(strl, 'utf-8')
            bio.write(struct.pack(len_type, len(utf8_string) + 1))
            # xxx...xxx
            bio.write(utf8_string)
            bio.write(null)
        bio.seek(0)
        return bio.read()
  2280. class StataWriter117(StataWriter):
  2281. """
  2282. A class for writing Stata binary dta files in Stata 13 format (117)
  2283. .. versionadded:: 0.23.0
  2284. Parameters
  2285. ----------
  2286. fname : path (string), buffer or path object
  2287. string, path object (pathlib.Path or py._path.local.LocalPath) or
        object implementing a binary write() function. If using a buffer
  2289. then the buffer will not be automatically closed after the file
  2290. is written.
  2291. data : DataFrame
  2292. Input to save
  2293. convert_dates : dict
  2294. Dictionary mapping columns containing datetime types to stata internal
  2295. format to use when writing the dates. Options are 'tc', 'td', 'tm',
  2296. 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
  2297. Datetime columns that do not have a conversion type specified will be
  2298. converted to 'tc'. Raises NotImplementedError if a datetime column has
  2299. timezone information
  2300. write_index : bool
  2301. Write the index to Stata dataset.
  2302. encoding : str
  2303. Default is latin-1. Only latin-1 and ascii are supported.
  2304. byteorder : str
  2305. Can be ">", "<", "little", or "big". default is `sys.byteorder`
  2306. time_stamp : datetime
  2307. A datetime to use as file creation date. Default is the current time
  2308. data_label : str
  2309. A label for the data set. Must be 80 characters or smaller.
  2310. variable_labels : dict
  2311. Dictionary containing columns as keys and variable labels as values.
  2312. Each label must be 80 characters or smaller.
  2313. convert_strl : list
  2314. List of columns names to convert to Stata StrL format. Columns with
        more than 2045 characters are automatically written as StrL.
  2316. Smaller columns can be converted by including the column name. Using
  2317. StrLs can reduce output file size when strings are longer than 8
  2318. characters, and either frequently repeated or sparse.
  2319. Returns
  2320. -------
  2321. writer : StataWriter117 instance
  2322. The StataWriter117 instance has a write_file method, which will
  2323. write the file to the given `fname`.
  2324. Raises
  2325. ------
  2326. NotImplementedError
  2327. * If datetimes contain timezone information
  2328. ValueError
  2329. * Columns listed in convert_dates are neither datetime64[ns]
  2330. or datetime.datetime
  2331. * Column dtype is not representable in Stata
  2332. * Column listed in convert_dates is not in DataFrame
  2333. * Categorical label contains more than 32,000 characters
  2334. Examples
  2335. --------
  2336. >>> from pandas.io.stata import StataWriter117
  2337. >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c'])
  2338. >>> writer = StataWriter117('./data_file.dta', data)
  2339. >>> writer.write_file()
  2340. Or with long strings stored in strl format
  2341. >>> data = pd.DataFrame([['A relatively long string'], [''], ['']],
  2342. ... columns=['strls'])
  2343. >>> writer = StataWriter117('./data_file_with_long_strings.dta', data,
  2344. ... convert_strl=['strls'])
  2345. >>> writer.write_file()
  2346. """
  2347. _max_string_length = 2045
  2348. @deprecate_kwarg(old_arg_name='encoding', new_arg_name=None)
  2349. def __init__(self, fname, data, convert_dates=None, write_index=True,
  2350. encoding="latin-1", byteorder=None, time_stamp=None,
  2351. data_label=None, variable_labels=None, convert_strl=None):
  2352. # Shallow copy since convert_strl might be modified later
  2353. self._convert_strl = [] if convert_strl is None else convert_strl[:]
  2354. super(StataWriter117, self).__init__(fname, data, convert_dates,
  2355. write_index, byteorder=byteorder,
  2356. time_stamp=time_stamp,
  2357. data_label=data_label,
  2358. variable_labels=variable_labels)
  2359. self._map = None
  2360. self._strl_blob = None
  2361. @staticmethod
  2362. def _tag(val, tag):
  2363. """Surround val with <tag></tag>"""
  2364. if isinstance(val, str) and compat.PY3:
  2365. val = _bytes(val, 'utf-8')
  2366. return (_bytes('<' + tag + '>', 'utf-8') + val +
  2367. _bytes('</' + tag + '>', 'utf-8'))
  2368. def _update_map(self, tag):
  2369. """Update map location for tag with file position"""
  2370. self._map[tag] = self._file.tell()
    def _write_header(self, data_label=None, time_stamp=None):
        """
        Write the file header.

        Writes the opening <stata_dta> marker, then a <header> block holding
        the release number, byte order, variable count, observation count,
        data label and creation timestamp.

        Parameters
        ----------
        data_label : str, optional
            Data set label; truncated to 80 characters.
        time_stamp : datetime, optional
            File creation time. Defaults to now; raises ValueError if not a
            datetime instance.
        """
        byteorder = self._byteorder
        self._file.write(_bytes('<stata_dta>', 'utf-8'))
        bio = BytesIO()
        # ds_format - 117
        bio.write(self._tag(_bytes('117', 'utf-8'), 'release'))
        # byteorder - MSF (most significant first) or LSF
        bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", 'byteorder'))
        # number of vars, 2 bytes
        assert self.nvar < 2 ** 16
        bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), 'K'))
        # number of obs, 4 bytes
        bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), 'N'))
        # data label 81 bytes, char, null terminated
        label = data_label[:80] if data_label is not None else ''
        label_len = struct.pack(byteorder + "B", len(label))
        label = label_len + _bytes(label, 'utf-8')
        bio.write(self._tag(label, 'label'))
        # time stamp, 18 bytes, char, null terminated
        # format dd Mon yyyy hh:mm
        if time_stamp is None:
            time_stamp = datetime.datetime.now()
        elif not isinstance(time_stamp, datetime.datetime):
            raise ValueError("time_stamp should be datetime type")
        # Avoid locale-specific month conversion
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                  'Sep', 'Oct', 'Nov', 'Dec']
        month_lookup = {i + 1: month for i, month in enumerate(months)}
        ts = (time_stamp.strftime("%d ") +
              month_lookup[time_stamp.month] +
              time_stamp.strftime(" %Y %H:%M"))
        # '\x11' added due to inspection of Stata file
        ts = b'\x11' + _bytes(ts, 'utf8')
        bio.write(self._tag(ts, 'timestamp'))
        bio.seek(0)
        self._file.write(self._tag(bio.read(), 'header'))
    def _write_map(self):
        """Called twice during file write. The first populates the values in
        the map with 0s. The second call writes the final map locations when
        all blocks have been written."""
        if self._map is None:
            # Key order matters: it defines the order in which the 14 uint64
            # offsets appear in the on-disk <map> block.
            self._map = OrderedDict((('stata_data', 0),
                                     ('map', self._file.tell()),
                                     ('variable_types', 0),
                                     ('varnames', 0),
                                     ('sortlist', 0),
                                     ('formats', 0),
                                     ('value_label_names', 0),
                                     ('variable_labels', 0),
                                     ('characteristics', 0),
                                     ('data', 0),
                                     ('strls', 0),
                                     ('value_labels', 0),
                                     ('stata_data_close', 0),
                                     ('end-of-file', 0)))
        # Move to start of map so the second call overwrites the first
        self._file.seek(self._map['map'])
        bio = BytesIO()
        for val in self._map.values():
            bio.write(struct.pack(self._byteorder + 'Q', val))
        bio.seek(0)
        self._file.write(self._tag(bio.read(), 'map'))
  2434. def _write_variable_types(self):
  2435. self._update_map('variable_types')
  2436. bio = BytesIO()
  2437. for typ in self.typlist:
  2438. bio.write(struct.pack(self._byteorder + 'H', typ))
  2439. bio.seek(0)
  2440. self._file.write(self._tag(bio.read(), 'variable_types'))
  2441. def _write_varnames(self):
  2442. self._update_map('varnames')
  2443. bio = BytesIO()
  2444. for name in self.varlist:
  2445. name = self._null_terminate(name, True)
  2446. name = _pad_bytes_new(name[:32], 33)
  2447. bio.write(name)
  2448. bio.seek(0)
  2449. self._file.write(self._tag(bio.read(), 'varnames'))
  2450. def _write_sortlist(self):
  2451. self._update_map('sortlist')
  2452. self._file.write(self._tag(b'\x00\00' * (self.nvar + 1), 'sortlist'))
  2453. def _write_formats(self):
  2454. self._update_map('formats')
  2455. bio = BytesIO()
  2456. for fmt in self.fmtlist:
  2457. bio.write(_pad_bytes_new(fmt, 49))
  2458. bio.seek(0)
  2459. self._file.write(self._tag(bio.read(), 'formats'))
  2460. def _write_value_label_names(self):
  2461. self._update_map('value_label_names')
  2462. bio = BytesIO()
  2463. for i in range(self.nvar):
  2464. # Use variable name when categorical
  2465. name = '' # default name
  2466. if self._is_col_cat[i]:
  2467. name = self.varlist[i]
  2468. name = self._null_terminate(name, True)
  2469. name = _pad_bytes_new(name[:32], 33)
  2470. bio.write(name)
  2471. bio.seek(0)
  2472. self._file.write(self._tag(bio.read(), 'value_label_names'))
  2473. def _write_variable_labels(self):
  2474. # Missing labels are 80 blank characters plus null termination
  2475. self._update_map('variable_labels')
  2476. bio = BytesIO()
  2477. blank = _pad_bytes_new('', 81)
  2478. if self._variable_labels is None:
  2479. for _ in range(self.nvar):
  2480. bio.write(blank)
  2481. bio.seek(0)
  2482. self._file.write(self._tag(bio.read(), 'variable_labels'))
  2483. return
  2484. for col in self.data:
  2485. if col in self._variable_labels:
  2486. label = self._variable_labels[col]
  2487. if len(label) > 80:
  2488. raise ValueError('Variable labels must be 80 characters '
  2489. 'or fewer')
  2490. is_latin1 = all(ord(c) < 256 for c in label)
  2491. if not is_latin1:
  2492. raise ValueError('Variable labels must contain only '
  2493. 'characters that can be encoded in '
  2494. 'Latin-1')
  2495. bio.write(_pad_bytes_new(label, 81))
  2496. else:
  2497. bio.write(blank)
  2498. bio.seek(0)
  2499. self._file.write(self._tag(bio.read(), 'variable_labels'))
  2500. def _write_characteristics(self):
  2501. self._update_map('characteristics')
  2502. self._file.write(self._tag(b'', 'characteristics'))
  2503. def _write_data(self):
  2504. self._update_map('data')
  2505. data = self.data
  2506. self._file.write(b'<data>')
  2507. self._file.write(data.tobytes())
  2508. self._file.write(b'</data>')
  2509. def _write_strls(self):
  2510. self._update_map('strls')
  2511. strls = b''
  2512. if self._strl_blob is not None:
  2513. strls = self._strl_blob
  2514. self._file.write(self._tag(strls, 'strls'))
    def _write_expansion_fields(self):
        """No-op in dta 117+"""
        # Format 117 stores this information in the <characteristics> block
        # (see _write_characteristics), so the base-class hook does nothing.
        pass
  2518. def _write_value_labels(self):
  2519. self._update_map('value_labels')
  2520. bio = BytesIO()
  2521. for vl in self._value_labels:
  2522. lab = vl.generate_value_label(self._byteorder, self._encoding)
  2523. lab = self._tag(lab, 'lbl')
  2524. bio.write(lab)
  2525. bio.seek(0)
  2526. self._file.write(self._tag(bio.read(), 'value_labels'))
  2527. def _write_file_close_tag(self):
  2528. self._update_map('stata_data_close')
  2529. self._file.write(_bytes('</stata_dta>', 'utf-8'))
  2530. self._update_map('end-of-file')
  2531. def _update_strl_names(self):
  2532. """Update column names for conversion to strl if they might have been
  2533. changed to comply with Stata naming rules"""
  2534. # Update convert_strl if names changed
  2535. for orig, new in self._converted_names.items():
  2536. if orig in self._convert_strl:
  2537. idx = self._convert_strl.index(orig)
  2538. self._convert_strl[idx] = new
  2539. def _convert_strls(self, data):
  2540. """Convert columns to StrLs if either very large or in the
  2541. convert_strl variable"""
  2542. convert_cols = [
  2543. col for i, col in enumerate(data)
  2544. if self.typlist[i] == 32768 or col in self._convert_strl]
  2545. if convert_cols:
  2546. ssw = StataStrLWriter(data, convert_cols)
  2547. tab, new_data = ssw.generate_table()
  2548. data = new_data
  2549. self._strl_blob = ssw.generate_blob(tab)
  2550. return data
  2551. def _set_formats_and_types(self, data, dtypes):
  2552. self.typlist = []
  2553. self.fmtlist = []
  2554. for col, dtype in dtypes.iteritems():
  2555. force_strl = col in self._convert_strl
  2556. fmt = _dtype_to_default_stata_fmt(dtype, data[col],
  2557. dta_version=117,
  2558. force_strl=force_strl)
  2559. self.fmtlist.append(fmt)
  2560. self.typlist.append(_dtype_to_stata_type_117(dtype, data[col],
  2561. force_strl))