# frequencies.py
# -*- coding: utf-8 -*-
from datetime import timedelta
import re

import numpy as np
from pytz import AmbiguousTimeError

from pandas._libs.algos import unique_deltas
from pandas._libs.tslibs import Timedelta, Timestamp
from pandas._libs.tslibs.ccalendar import MONTH_ALIASES, int_to_weekday
from pandas._libs.tslibs.conversion import tz_convert
from pandas._libs.tslibs.fields import build_field_sarray
import pandas._libs.tslibs.frequencies as libfreqs
from pandas._libs.tslibs.offsets import _offset_to_period_map
import pandas._libs.tslibs.resolution as libresolution
from pandas._libs.tslibs.resolution import Resolution
from pandas._libs.tslibs.timezones import UTC
import pandas.compat as compat
from pandas.compat import zip

from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
    is_datetime64_dtype, is_period_arraylike, is_timedelta64_dtype)
from pandas.core.dtypes.generic import ABCSeries

from pandas.core.algorithms import unique

from pandas.tseries.offsets import (
    DateOffset, Day, Hour, Micro, Milli, Minute, Nano, Second, prefix_mapping)
# Fixed unit-conversion factors, all expressed in nanoseconds -- the
# resolution of the datetime64/timedelta64 i8 values this module inspects.
_ONE_MICRO = 1000
_ONE_MILLI = (_ONE_MICRO * 1000)
_ONE_SECOND = (_ONE_MILLI * 1000)
_ONE_MINUTE = (60 * _ONE_SECOND)
_ONE_HOUR = (60 * _ONE_MINUTE)
_ONE_DAY = (24 * _ONE_HOUR)

# ---------------------------------------------------------------------
# Offset names ("time rules") and related functions

#: cache of previously seen offsets; populated lazily by get_offset
_offset_map = {}
  35. def get_period_alias(offset_str):
  36. """ alias to closest period strings BQ->Q etc"""
  37. return _offset_to_period_map.get(offset_str, None)
  38. _name_to_offset_map = {'days': Day(1),
  39. 'hours': Hour(1),
  40. 'minutes': Minute(1),
  41. 'seconds': Second(1),
  42. 'milliseconds': Milli(1),
  43. 'microseconds': Micro(1),
  44. 'nanoseconds': Nano(1)}
def to_offset(freq):
    """
    Return DateOffset object from string or tuple representation
    or datetime.timedelta object

    Parameters
    ----------
    freq : str, tuple, datetime.timedelta, DateOffset or None

    Returns
    -------
    delta : DateOffset
        None if freq is None

    Raises
    ------
    ValueError
        If freq is an invalid frequency

    See Also
    --------
    pandas.DateOffset

    Examples
    --------
    >>> to_offset('5min')
    <5 * Minutes>

    >>> to_offset('1D1H')
    <25 * Hours>

    >>> to_offset(('W', 2))
    <2 * Weeks: weekday=6>

    >>> to_offset((2, 'B'))
    <2 * BusinessDays>

    >>> to_offset(datetime.timedelta(days=1))
    <Day>

    >>> to_offset(Hour())
    <Hour>
    """
    if freq is None:
        return None

    if isinstance(freq, DateOffset):
        return freq

    if isinstance(freq, tuple):
        # Accept both (name, stride) and (stride, name) orderings.
        name = freq[0]
        stride = freq[1]
        if isinstance(stride, compat.string_types):
            name, stride = stride, name
        name, _ = libfreqs._base_and_stride(name)
        delta = get_offset(name) * stride

    elif isinstance(freq, timedelta):
        delta = None
        freq = Timedelta(freq)
        try:
            # Sum one scaled offset per nonzero Timedelta component
            # (days, hours, ..., nanoseconds).
            for name in freq.components._fields:
                offset = _name_to_offset_map[name]
                stride = getattr(freq.components, name)
                if stride != 0:
                    offset = stride * offset
                    if delta is None:
                        delta = offset
                    else:
                        delta = delta + offset
        except Exception:
            # normalize any failure into the canonical invalid-freq error
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    else:
        delta = None
        stride_sign = None
        try:
            # opattern tokenizes e.g. '1D1H' into (sep, stride, name) groups
            splitted = re.split(libfreqs.opattern, freq)
            if splitted[-1] != '' and not splitted[-1].isspace():
                # the last element must be blank
                raise ValueError('last element must be blank')
            for sep, stride, name in zip(splitted[0::4], splitted[1::4],
                                         splitted[2::4]):
                if sep != '' and not sep.isspace():
                    raise ValueError('separator must be spaces')
                prefix = libfreqs._lite_rule_alias.get(name) or name
                if stride_sign is None:
                    # sign of the first stride applies to every component
                    stride_sign = -1 if stride.startswith('-') else 1
                if not stride:
                    stride = 1
                if prefix in Resolution._reso_str_bump_map.keys():
                    # decimal strides like '2.5min' bump to a finer resolution
                    stride, name = Resolution.get_stride_from_decimal(
                        float(stride), prefix
                    )
                stride = int(stride)
                offset = get_offset(name)
                offset = offset * int(np.fabs(stride) * stride_sign)
                if delta is None:
                    delta = offset
                else:
                    delta = delta + offset
        except Exception:
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    if delta is None:
        raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    return delta
  137. def get_offset(name):
  138. """
  139. Return DateOffset object associated with rule name
  140. Examples
  141. --------
  142. get_offset('EOM') --> BMonthEnd(1)
  143. """
  144. if name not in libfreqs._dont_uppercase:
  145. name = name.upper()
  146. name = libfreqs._lite_rule_alias.get(name, name)
  147. name = libfreqs._lite_rule_alias.get(name.lower(), name)
  148. else:
  149. name = libfreqs._lite_rule_alias.get(name, name)
  150. if name not in _offset_map:
  151. try:
  152. split = name.split('-')
  153. klass = prefix_mapping[split[0]]
  154. # handles case where there's no suffix (and will TypeError if too
  155. # many '-')
  156. offset = klass._from_name(*split[1:])
  157. except (ValueError, TypeError, KeyError):
  158. # bad prefix or suffix
  159. raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name))
  160. # cache
  161. _offset_map[name] = offset
  162. return _offset_map[name]
# ---------------------------------------------------------------------
# Period codes

def infer_freq(index, warn=True):
    """
    Infer the most likely frequency given the input index. If the frequency is
    uncertain, a warning will be printed.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
        if passed a Series will use the values of the series (NOT THE INDEX)
    warn : boolean, default True

    Returns
    -------
    freq : string or None
        None if no discernible frequency

    Raises
    ------
    TypeError if the index is not datetime-like
    ValueError if there are less than three values.
    """
    # local import to avoid a circular dependency at module load time
    import pandas as pd

    if isinstance(index, ABCSeries):
        values = index._values
        if not (is_datetime64_dtype(values) or
                is_timedelta64_dtype(values) or
                values.dtype == object):
            raise TypeError("cannot infer freq from a non-convertible dtype "
                            "on a Series of {dtype}".format(dtype=index.dtype))
        index = values

    if is_period_arraylike(index):
        raise TypeError("PeriodIndex given. Check the `freq` attribute "
                        "instead of using infer_freq.")
    elif is_timedelta64_dtype(index):
        # Allow TimedeltaIndex and TimedeltaArray
        inferer = _TimedeltaFrequencyInferer(index, warn=warn)
        return inferer.get_freq()

    if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):
        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
            raise TypeError("cannot infer freq from a non-convertible index "
                            "type {type}".format(type=type(index)))
        index = index.values

    if not isinstance(index, pd.DatetimeIndex):
        try:
            index = pd.DatetimeIndex(index)
        except AmbiguousTimeError:
            # ambiguous localization (DST); fall back to the raw i8 values
            index = pd.DatetimeIndex(index.asi8)

    inferer = _FrequencyInferer(index, warn=warn)
    return inferer.get_freq()
class _FrequencyInferer(object):
    """
    Infer a frequency string from the i8 values of a datetime-like index.

    Not sure if I can avoid the state machine here
    """

    def __init__(self, index, warn=True):
        self.index = index
        self.values = index.asi8

        # This moves the values, which are implicitly in UTC, to the
        # the timezone so they are in local time
        if hasattr(index, 'tz'):
            if index.tz is not None:
                self.values = tz_convert(self.values, UTC, index.tz)

        self.warn = warn

        if len(index) < 3:
            raise ValueError('Need at least 3 dates to infer frequency')

        # a frequency is only meaningful for monotonic data (either direction)
        self.is_monotonic = (self.index._is_monotonic_increasing or
                             self.index._is_monotonic_decreasing)

    @cache_readonly
    def deltas(self):
        # unique gaps between consecutive localized values, in nanoseconds
        return unique_deltas(self.values)

    @cache_readonly
    def deltas_asi8(self):
        # unique gaps of the raw UTC i8 values (unaffected by tz conversion)
        return unique_deltas(self.index.asi8)

    @cache_readonly
    def is_unique(self):
        # True when all consecutive gaps are identical (localized values)
        return len(self.deltas) == 1

    @cache_readonly
    def is_unique_asi8(self):
        # True when all consecutive gaps are identical (raw UTC values)
        return len(self.deltas_asi8) == 1

    def get_freq(self):
        """
        Find the appropriate frequency string to describe the inferred
        frequency of self.values

        Returns
        -------
        freqstr : str or None
        """
        if not self.is_monotonic or not self.index._is_unique:
            return None

        delta = self.deltas[0]
        if _is_multiple(delta, _ONE_DAY):
            return self._infer_daily_rule()

        # Business hourly, maybe. 17: one day / 65: one weekend
        if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
            return 'BH'
        # Possibly intraday frequency. Here we use the
        # original .asi8 values as the modified values
        # will not work around DST transitions. See #8772
        elif not self.is_unique_asi8:
            return None

        delta = self.deltas_asi8[0]
        if _is_multiple(delta, _ONE_HOUR):
            # Hours
            return _maybe_add_count('H', delta / _ONE_HOUR)
        elif _is_multiple(delta, _ONE_MINUTE):
            # Minutes
            return _maybe_add_count('T', delta / _ONE_MINUTE)
        elif _is_multiple(delta, _ONE_SECOND):
            # Seconds
            return _maybe_add_count('S', delta / _ONE_SECOND)
        elif _is_multiple(delta, _ONE_MILLI):
            # Milliseconds
            return _maybe_add_count('L', delta / _ONE_MILLI)
        elif _is_multiple(delta, _ONE_MICRO):
            # Microseconds
            return _maybe_add_count('U', delta / _ONE_MICRO)
        else:
            # Nanoseconds
            return _maybe_add_count('N', delta)

    @cache_readonly
    def day_deltas(self):
        # unique gaps expressed in days
        return [x / _ONE_DAY for x in self.deltas]

    @cache_readonly
    def hour_deltas(self):
        # unique gaps expressed in hours
        return [x / _ONE_HOUR for x in self.deltas]

    @cache_readonly
    def fields(self):
        # structured array of datetime fields (Y, M, D, ...) per value
        return build_field_sarray(self.values)

    @cache_readonly
    def rep_stamp(self):
        # representative stamp: the first value, used to anchor weekday/month
        return Timestamp(self.values[0])

    def month_position_check(self):
        # 'cs'/'bs'/'ce'/'be': calendar/business month start/end position
        return libresolution.month_position_check(self.fields,
                                                  self.index.dayofweek)

    @cache_readonly
    def mdiffs(self):
        # unique month-count gaps between consecutive values
        nmonths = self.fields['Y'] * 12 + self.fields['M']
        return unique_deltas(nmonths.astype('i8'))

    @cache_readonly
    def ydiffs(self):
        # unique year gaps between consecutive values
        return unique_deltas(self.fields['Y'].astype('i8'))

    def _infer_daily_rule(self):
        # checked from coarsest to finest: annual, quarterly, monthly,
        # weekly/daily, business daily, week-of-month
        annual_rule = self._get_annual_rule()
        if annual_rule:
            nyears = self.ydiffs[0]
            month = MONTH_ALIASES[self.rep_stamp.month]
            alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month)
            return _maybe_add_count(alias, nyears)

        quarterly_rule = self._get_quarterly_rule()
        if quarterly_rule:
            nquarters = self.mdiffs[0] / 3
            mod_dict = {0: 12, 2: 11, 1: 10}
            month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
            alias = '{prefix}-{month}'.format(prefix=quarterly_rule,
                                              month=month)
            return _maybe_add_count(alias, nquarters)

        monthly_rule = self._get_monthly_rule()
        if monthly_rule:
            return _maybe_add_count(monthly_rule, self.mdiffs[0])

        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 == 0:
                # Weekly
                day = int_to_weekday[self.rep_stamp.weekday()]
                return _maybe_add_count(
                    'W-{day}'.format(day=day), days / 7)
            else:
                return _maybe_add_count('D', days)

        if self._is_business_daily():
            return 'B'

        wom_rule = self._get_wom_rule()
        if wom_rule:
            return wom_rule

    def _get_annual_rule(self):
        if len(self.ydiffs) > 1:
            return None

        if len(unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'AS', 'bs': 'BAS',
                'ce': 'A', 'be': 'BA'}.get(pos_check)

    def _get_quarterly_rule(self):
        if len(self.mdiffs) > 1:
            return None

        if not self.mdiffs[0] % 3 == 0:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'QS', 'bs': 'BQS',
                'ce': 'Q', 'be': 'BQ'}.get(pos_check)

    def _get_monthly_rule(self):
        if len(self.mdiffs) > 1:
            return None
        pos_check = self.month_position_check()
        return {'cs': 'MS', 'bs': 'BMS',
                'ce': 'M', 'be': 'BM'}.get(pos_check)

    def _is_business_daily(self):
        # quick check: cannot be business daily
        if self.day_deltas != [1, 3]:
            return False

        # probably business daily, but need to confirm
        first_weekday = self.index[0].weekday()
        shifts = np.diff(self.index.asi8)
        shifts = np.floor_divide(shifts, _ONE_DAY)
        weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
        # every 3-day shift must start on Friday (weekday 4 -> cumsum 0 check)
        return np.all(((weekdays == 0) & (shifts == 3)) |
                      ((weekdays > 0) & (weekdays <= 4) & (shifts == 1)))

    def _get_wom_rule(self):
        # wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #     return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)
  382. class _TimedeltaFrequencyInferer(_FrequencyInferer):
  383. def _infer_daily_rule(self):
  384. if self.is_unique:
  385. days = self.deltas[0] / _ONE_DAY
  386. if days % 7 == 0:
  387. # Weekly
  388. wd = int_to_weekday[self.rep_stamp.weekday()]
  389. alias = 'W-{weekday}'.format(weekday=wd)
  390. return _maybe_add_count(alias, days / 7)
  391. else:
  392. return _maybe_add_count('D', days)
  393. def _is_multiple(us, mult):
  394. return us % mult == 0
  395. def _maybe_add_count(base, count):
  396. if count != 1:
  397. assert count == int(count)
  398. count = int(count)
  399. return '{count}{base}'.format(count=count, base=base)
  400. else:
  401. return base