123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498 |
- # -*- coding: utf-8 -*-
- from datetime import timedelta
- import re
- import numpy as np
- from pytz import AmbiguousTimeError
- from pandas._libs.algos import unique_deltas
- from pandas._libs.tslibs import Timedelta, Timestamp
- from pandas._libs.tslibs.ccalendar import MONTH_ALIASES, int_to_weekday
- from pandas._libs.tslibs.conversion import tz_convert
- from pandas._libs.tslibs.fields import build_field_sarray
- import pandas._libs.tslibs.frequencies as libfreqs
- from pandas._libs.tslibs.offsets import _offset_to_period_map
- import pandas._libs.tslibs.resolution as libresolution
- from pandas._libs.tslibs.resolution import Resolution
- from pandas._libs.tslibs.timezones import UTC
- import pandas.compat as compat
- from pandas.compat import zip
- from pandas.util._decorators import cache_readonly
- from pandas.core.dtypes.common import (
- is_datetime64_dtype, is_period_arraylike, is_timedelta64_dtype)
- from pandas.core.dtypes.generic import ABCSeries
- from pandas.core.algorithms import unique
- from pandas.tseries.offsets import (
- DateOffset, Day, Hour, Micro, Milli, Minute, Nano, Second, prefix_mapping)
# Lengths of common time units, each built from the previous one.
# The base tick is the nanosecond (1000 ticks per microsecond).
_ONE_MICRO = 1000
_ONE_MILLI = 1000 * _ONE_MICRO
_ONE_SECOND = 1000 * _ONE_MILLI
_ONE_MINUTE = 60 * _ONE_SECOND
_ONE_HOUR = 60 * _ONE_MINUTE
_ONE_DAY = 24 * _ONE_HOUR

# ---------------------------------------------------------------------
# Offset names ("time rules") and related functions

#: cache of previously seen offsets
_offset_map = {}
def get_period_alias(offset_str):
    """
    Look up the period alias registered for an offset alias.

    Parameters
    ----------
    offset_str : str
        Offset alias to translate, e.g. 'BQ'.

    Returns
    -------
    str or None
        The entry of ``_offset_to_period_map`` for ``offset_str``
        (e.g. 'BQ' -> 'Q'), or None when there is no entry.
    """
    return _offset_to_period_map.get(offset_str)
# One unit-sized offset per component name; ``to_offset`` looks up the
# field names of ``Timedelta.components`` here when converting a timedelta.
_name_to_offset_map = {
    'days': Day(1),
    'hours': Hour(1),
    'minutes': Minute(1),
    'seconds': Second(1),
    'milliseconds': Milli(1),
    'microseconds': Micro(1),
    'nanoseconds': Nano(1),
}
def to_offset(freq):
    """
    Convert a frequency specification into a DateOffset.

    Parameters
    ----------
    freq : str, tuple, datetime.timedelta, DateOffset or None
        * None and DateOffset instances are returned unchanged.
        * A tuple holds a rule name and a stride, in either order.
        * A timedelta is decomposed into its non-zero components.
        * Anything else is parsed as a frequency string made of one or
          more ``<stride><name>`` pieces separated by optional whitespace.

    Returns
    -------
    delta : DateOffset
        None if freq is None

    Raises
    ------
    ValueError
        If freq is an invalid frequency

    See Also
    --------
    pandas.DateOffset

    Examples
    --------
    >>> to_offset('5min')
    <5 * Minutes>

    >>> to_offset('1D1H')
    <25 * Hours>

    >>> to_offset(('W', 2))
    <2 * Weeks: weekday=6>

    >>> to_offset((2, 'B'))
    <2 * BusinessDays>

    >>> to_offset(datetime.timedelta(days=1))
    <Day>

    >>> to_offset(Hour())
    <Hour>
    """
    if freq is None:
        return None

    if isinstance(freq, DateOffset):
        return freq

    if isinstance(freq, tuple):
        name, stride = freq[0], freq[1]
        if isinstance(stride, compat.string_types):
            # given as (stride, name): put the pair back in order
            name, stride = stride, name
        name, _ = libfreqs._base_and_stride(name)
        delta = get_offset(name) * stride

    elif isinstance(freq, timedelta):
        delta = None
        freq = Timedelta(freq)
        try:
            components = freq.components
            for name in components._fields:
                unit = _name_to_offset_map[name]
                stride = getattr(components, name)
                if stride == 0:
                    continue
                offset = stride * unit
                delta = offset if delta is None else delta + offset
        except Exception:
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    else:
        delta = None
        sign = None
        try:
            pieces = re.split(libfreqs.opattern, freq)

            # whatever trails the final match must be blank
            tail = pieces[-1]
            if tail and not tail.isspace():
                raise ValueError('last element must be blank')

            # the split result is consumed in groups of four:
            # separator text, stride, rule name (fourth entry unused)
            triples = zip(pieces[0::4], pieces[1::4], pieces[2::4])
            for sep, stride, name in triples:
                if sep and not sep.isspace():
                    raise ValueError('separator must be spaces')

                prefix = libfreqs._lite_rule_alias.get(name) or name

                # the sign of the first stride applies to every piece
                if sign is None:
                    sign = -1 if stride.startswith('-') else 1

                if not stride:
                    stride = 1

                if prefix in Resolution._reso_str_bump_map:
                    stride, name = Resolution.get_stride_from_decimal(
                        float(stride), prefix)

                stride = int(stride)
                offset = get_offset(name) * int(np.fabs(stride) * sign)
                delta = offset if delta is None else delta + offset
        except Exception:
            raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    if delta is None:
        raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(freq))

    return delta
def get_offset(name):
    """
    Return the DateOffset object associated with a rule name.

    The name is normalised through ``libfreqs._lite_rule_alias`` (and
    upper-cased unless listed in ``libfreqs._dont_uppercase``). Offsets
    are built once and then served from the module-level ``_offset_map``.

    Parameters
    ----------
    name : str
        Rule name, optionally followed by '-'-separated suffixes.

    Returns
    -------
    DateOffset

    Raises
    ------
    ValueError
        If the prefix is unknown or the suffix is rejected.
    """
    aliases = libfreqs._lite_rule_alias
    if name in libfreqs._dont_uppercase:
        name = aliases.get(name, name)
    else:
        name = name.upper()
        name = aliases.get(name, name)
        name = aliases.get(name.lower(), name)

    if name in _offset_map:
        return _offset_map[name]

    try:
        pieces = name.split('-')
        klass = prefix_mapping[pieces[0]]
        # with no suffix this passes no arguments; too many '-' makes
        # _from_name raise TypeError
        offset = klass._from_name(*pieces[1:])
    except (ValueError, TypeError, KeyError):
        # bad prefix or suffix
        raise ValueError(libfreqs.INVALID_FREQ_ERR_MSG.format(name))

    # remember the offset for later calls
    _offset_map[name] = offset
    return offset
# ---------------------------------------------------------------------
# Period codes


def infer_freq(index, warn=True):
    """
    Infer the most likely frequency given the input index.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
        if passed a Series will use the values of the series (NOT THE INDEX)
    warn : boolean, default True
        Forwarded to the frequency inferer.

    Returns
    -------
    freq : string or None
        None if no discernible frequency

    Raises
    ------
    TypeError
        If the index is not datetime-like
    ValueError
        If there are less than three values.
    """
    import pandas as pd

    if isinstance(index, ABCSeries):
        values = index._values
        convertible = (is_datetime64_dtype(values) or
                       is_timedelta64_dtype(values) or
                       values.dtype == object)
        if not convertible:
            raise TypeError("cannot infer freq from a non-convertible dtype "
                            "on a Series of {dtype}".format(dtype=index.dtype))
        index = values

    if is_period_arraylike(index):
        raise TypeError("PeriodIndex given. Check the `freq` attribute "
                        "instead of using infer_freq.")

    if is_timedelta64_dtype(index):
        # Allow TimedeltaIndex and TimedeltaArray
        return _TimedeltaFrequencyInferer(index, warn=warn).get_freq()

    if not isinstance(index, pd.DatetimeIndex):
        if isinstance(index, pd.Index):
            if isinstance(index, (pd.Int64Index, pd.Float64Index)):
                raise TypeError("cannot infer freq from a non-convertible "
                                "index type {type}".format(type=type(index)))
            index = index.values

        try:
            index = pd.DatetimeIndex(index)
        except AmbiguousTimeError:
            index = pd.DatetimeIndex(index.asi8)

    return _FrequencyInferer(index, warn=warn).get_freq()
class _FrequencyInferer(object):
    """
    Infer a frequency alias from the spacing of a datetime-like index.

    Two views of the integer values are used: ``self.values`` (shifted
    into the index's timezone when it has one) drives the calendar-based
    rules, while the untouched ``self.index.asi8`` drives the intraday
    rules.

    Parameters
    ----------
    index : datetime-like index exposing ``asi8``
    warn : bool, default True
        Stored on the instance; not consulted by the methods of this class.

    Raises
    ------
    ValueError
        If ``index`` holds fewer than three elements.
    """

    def __init__(self, index, warn=True):
        self.index = index
        self.values = index.asi8

        # The integer values are implicitly in UTC; move them to the
        # index's timezone so they are in local time.
        if hasattr(index, 'tz') and index.tz is not None:
            self.values = tz_convert(self.values, UTC, index.tz)

        self.warn = warn

        if len(index) < 3:
            raise ValueError('Need at least 3 dates to infer frequency')

        increasing = self.index._is_monotonic_increasing
        self.is_monotonic = increasing or self.index._is_monotonic_decreasing

    @cache_readonly
    def deltas(self):
        """Result of ``unique_deltas`` over the localized values."""
        return unique_deltas(self.values)

    @cache_readonly
    def deltas_asi8(self):
        """Result of ``unique_deltas`` over the index's raw ``asi8``."""
        return unique_deltas(self.index.asi8)

    @cache_readonly
    def is_unique(self):
        """Whether ``deltas`` holds exactly one entry."""
        return len(self.deltas) == 1

    @cache_readonly
    def is_unique_asi8(self):
        """Whether ``deltas_asi8`` holds exactly one entry."""
        return len(self.deltas_asi8) == 1

    def get_freq(self):
        """
        Find the appropriate frequency string to describe the inferred
        frequency of self.values

        Returns
        -------
        freqstr : str or None
        """
        if not (self.is_monotonic and self.index._is_unique):
            return None

        if _is_multiple(self.deltas[0], _ONE_DAY):
            return self._infer_daily_rule()

        # Business hourly, maybe. 17: one day / 65: one weekend
        if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
            return 'BH'

        # Possibly intraday frequency. Here we use the original .asi8
        # values as the modified values will not work around DST
        # transitions. See #8772
        if not self.is_unique_asi8:
            return None

        delta = self.deltas_asi8[0]
        # coarsest unit first: hours, minutes, seconds, milli-, microseconds
        intraday_units = (
            ('H', _ONE_HOUR),
            ('T', _ONE_MINUTE),
            ('S', _ONE_SECOND),
            ('L', _ONE_MILLI),
            ('U', _ONE_MICRO),
        )
        for code, unit in intraday_units:
            if _is_multiple(delta, unit):
                return _maybe_add_count(code, delta / unit)

        # Nanoseconds
        return _maybe_add_count('N', delta)

    @cache_readonly
    def day_deltas(self):
        """``deltas`` expressed in days."""
        return [step / _ONE_DAY for step in self.deltas]

    @cache_readonly
    def hour_deltas(self):
        """``deltas`` expressed in hours."""
        return [step / _ONE_HOUR for step in self.deltas]

    @cache_readonly
    def fields(self):
        """Field array built from ``self.values`` (read via 'Y' and 'M')."""
        return build_field_sarray(self.values)

    @cache_readonly
    def rep_stamp(self):
        """Timestamp of the first localized value."""
        return Timestamp(self.values[0])

    def month_position_check(self):
        """
        Delegate to ``libresolution.month_position_check``.

        The result is used as a key ('cs', 'bs', 'ce' or 'be') by the
        annual, quarterly and monthly rule lookups.
        """
        return libresolution.month_position_check(self.fields,
                                                  self.index.dayofweek)

    @cache_readonly
    def mdiffs(self):
        """Result of ``unique_deltas`` over the month count (12 * Y + M)."""
        month_count = self.fields['Y'] * 12 + self.fields['M']
        return unique_deltas(month_count.astype('i8'))

    @cache_readonly
    def ydiffs(self):
        """Result of ``unique_deltas`` over the year field."""
        return unique_deltas(self.fields['Y'].astype('i8'))

    def _infer_daily_rule(self):
        """
        Pick a rule for spacings that are whole multiples of a day.

        Tried in order: annual, quarterly, monthly, weekly / daily (only
        when the spacing is unique), business daily, week-of-month.

        Returns
        -------
        str or None
        """
        annual_rule = self._get_annual_rule()
        if annual_rule:
            nyears = self.ydiffs[0]
            month = MONTH_ALIASES[self.rep_stamp.month]
            alias = '{prefix}-{month}'.format(prefix=annual_rule, month=month)
            return _maybe_add_count(alias, nyears)

        quarterly_rule = self._get_quarterly_rule()
        if quarterly_rule:
            nquarters = self.mdiffs[0] / 3
            # month % 3 -> anchor month number of the quarterly alias
            mod_dict = {0: 12, 2: 11, 1: 10}
            anchor = mod_dict[self.rep_stamp.month % 3]
            alias = '{prefix}-{month}'.format(prefix=quarterly_rule,
                                              month=MONTH_ALIASES[anchor])
            return _maybe_add_count(alias, nquarters)

        monthly_rule = self._get_monthly_rule()
        if monthly_rule:
            return _maybe_add_count(monthly_rule, self.mdiffs[0])

        if self.is_unique:
            days = self.deltas[0] / _ONE_DAY
            if days % 7 != 0:
                return _maybe_add_count('D', days)
            # Weekly
            day = int_to_weekday[self.rep_stamp.weekday()]
            return _maybe_add_count('W-{day}'.format(day=day), days / 7)

        if self._is_business_daily():
            return 'B'

        return self._get_wom_rule() or None

    def _get_annual_rule(self):
        """Return 'AS', 'BAS', 'A' or 'BA', or None if not annual."""
        if len(self.ydiffs) > 1:
            return None

        # every stamp must fall in the same month
        if len(unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        rules = {'cs': 'AS', 'bs': 'BAS', 'ce': 'A', 'be': 'BA'}
        return rules.get(pos_check)

    def _get_quarterly_rule(self):
        """Return 'QS', 'BQS', 'Q' or 'BQ', or None if not quarterly."""
        if len(self.mdiffs) > 1:
            return None

        if self.mdiffs[0] % 3 != 0:
            return None

        pos_check = self.month_position_check()
        rules = {'cs': 'QS', 'bs': 'BQS', 'ce': 'Q', 'be': 'BQ'}
        return rules.get(pos_check)

    def _get_monthly_rule(self):
        """Return 'MS', 'BMS', 'M' or 'BM', or None if not monthly."""
        if len(self.mdiffs) > 1:
            return None

        pos_check = self.month_position_check()
        rules = {'cs': 'MS', 'bs': 'BMS', 'ce': 'M', 'be': 'BM'}
        return rules.get(pos_check)

    def _is_business_daily(self):
        """
        Check whether the index steps through weekdays only.

        Every step must be one day landing on Tuesday-Friday, or three
        days landing on Monday.
        """
        # quick check: cannot be business daily
        if self.day_deltas != [1, 3]:
            return False

        # probably business daily, but need to confirm
        first_weekday = self.index[0].weekday()
        day_steps = np.floor_divide(np.diff(self.index.asi8), _ONE_DAY)
        weekdays = np.mod(first_weekday + np.cumsum(day_steps), 7)

        over_weekend = (weekdays == 0) & (day_steps == 3)
        within_week = (weekdays > 0) & (weekdays <= 4) & (day_steps == 1)
        return np.all(over_weekend | within_week)

    def _get_wom_rule(self):
        """
        Return a 'WOM-<week><weekday>' alias, or None.

        Requires a single weekday across the index and a single
        week-of-month among those below the fifth.
        """
        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) != 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = int_to_weekday[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)
class _TimedeltaFrequencyInferer(_FrequencyInferer):
    """Frequency inferer whose day-multiple rules are weekly or daily only."""

    def _infer_daily_rule(self):
        """
        Return a 'W-<weekday>' or 'D' alias (with count) for a unique
        day-multiple spacing, or None when the spacing is not unique.
        """
        if not self.is_unique:
            return None

        days = self.deltas[0] / _ONE_DAY
        if days % 7 != 0:
            return _maybe_add_count('D', days)

        # Weekly
        wd = int_to_weekday[self.rep_stamp.weekday()]
        alias = 'W-{weekday}'.format(weekday=wd)
        return _maybe_add_count(alias, days / 7)
def _is_multiple(us, mult):
    """Return whether ``us`` divides evenly by ``mult``."""
    remainder = us % mult
    return remainder == 0
def _maybe_add_count(base, count):
    """
    Prefix a frequency alias with its multiple, unless the multiple is 1.

    Parameters
    ----------
    base : str
        Frequency alias.
    count : number
        Multiple of ``base``; must be integral in value.

    Returns
    -------
    str
        ``base`` when ``count`` equals 1, otherwise ``'<count><base>'``
        with ``count`` rendered as an integer.
    """
    if count == 1:
        return base

    # a fractional count would indicate a caller error
    assert count == int(count)
    return '{count}{base}'.format(count=int(count), base=base)