1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Wed Dec 9 09:55:52 2020
- @author: tanya
- """
- from typing import Union, Iterable, Tuple, List
- import pandas as pd
- import numpy as np
- from itertools import accumulate, repeat, takewhile
- from cdplib.log import Log
def make_expanding_cv(test_proportion: float,
                      start_train_proportion: float,
                      step_proportion: float = None,
                      expanding_test_size: bool = False,
                      data_set_size: Union[float, None] = None,
                      index: Union[pd.Series, np.ndarray, list, None] = None)\
        -> Union[Iterable[Tuple[List]], None]:
    """
    Generate (train, test) folds for expanding-window cross-validation.

    Every fold trains on the first ``train_size`` rows and tests on the rows
    that immediately follow. ``train_size`` starts at
    ``int(start_train_proportion * n)`` and grows by
    ``int(step_proportion * n)`` per fold, for as long as a test window of
    ``int(test_proportion * n)`` rows still fits after the training window
    (``n`` is the number of rows).

    This is a generator: nothing is validated or computed until the first
    fold is requested.

    :param test_proportion: share of the data used as the test window.
    :param start_train_proportion: share of the data in the first training
        window.
    :param step_proportion: share of the data by which the training window
        grows between folds. When omitted, ``test_proportion`` is used.
    :param expanding_test_size: if True, the test window of a fold has
        ``int(test_proportion * train_size)`` rows (it grows with the
        training window); otherwise it always has
        ``int(test_proportion * n)`` rows.
    :param data_set_size: number of rows; the folds then contain positions
        ``0 .. n-1``. Exactly one of ``data_set_size`` / ``index`` must be
        given.
    :param index: the values to split. Rows are always selected by position,
        never by label. Lists and tuples are converted to numpy arrays.

    :return: iterator over ``(train, test)`` pairs taken from ``index``.

    :raises: whatever ``Log.log_and_raise_error`` raises; any failure
        (including invalid arguments) is reported through it.
    """
    try:
        # Explicit check instead of `assert`, which is removed under -O.
        if (index is None) == (data_set_size is None):
            raise ValueError("Set index or data_set_size")

        if index is None:
            data_set_size = int(data_set_size)
            index = pd.Series(range(data_set_size))
        else:
            if isinstance(index, (list, tuple)):
                # Builtin sequences cannot be indexed with an array of
                # positions, numpy arrays can.
                index = np.asarray(index)
            data_set_size = len(index)

        if step_proportion is None:
            step_proportion = test_proportion

        start_train_size = int(start_train_proportion * data_set_size)
        step_size = int(step_proportion * data_set_size)
        test_size = int(test_proportion * data_set_size)

        if start_train_size < 1:
            raise ValueError(
                "start_train_proportion gives an empty initial training "
                "window for {} rows".format(data_set_size))

        # A step of zero rows would repeat the same fold forever.
        if step_size < 1:
            raise ValueError(
                "step_proportion gives a step of less than one row "
                "for {} rows".format(data_set_size))

        # `series[[0, 1, 2]]` selects by label while `series[0:3]` selects
        # by position; going through .iloc makes both selections positional.
        if isinstance(index, (pd.Series, pd.DataFrame)):
            rows = index.iloc
        else:
            rows = index

        last_train_size = data_set_size - test_size

        for train_size in range(start_train_size,
                                last_train_size + 1,
                                step_size):

            if expanding_test_size:
                fold_test_size = int(test_proportion * train_size)
            else:
                fold_test_size = test_size

            yield (rows[np.arange(train_size)],
                   rows[train_size: train_size + fold_test_size])

    except Exception as e:
        # The logger is only needed to report a failure, so it is created
        # here rather than on every call.
        logger = Log("make_expanding_cv:")
        logger.log_and_raise_error(("Failed to make expanding cv. "
                                    "Exit with error: {}".format(e)))
if __name__ == "__main__":
    # Smoke test: build the folds once from a data set size and once from a
    # datetime index. Only successful execution is checked, not the content
    # of the folds.
    logger = Log("Test_expanding_cv: ")
    logger.info("Start Testing")

    # Proportions shared by both runs.
    proportions = {"test_proportion": 0.1,
                   "start_train_proportion": 0.6,
                   "step_proportion": 0.1}

    logger.info("Testing expanding cv: ")
    cv = list(make_expanding_cv(data_set_size=50,
                                expanding_test_size=True,
                                **proportions))

    logger.info("Testing expanding cv with datetime index")
    cv = list(make_expanding_cv(
        index=pd.date_range(start=pd.to_datetime("2020-01-01"),
                            periods=50),
        **proportions))

    logger.info("Finish testing")
|