expanding_cv.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Wed Dec 9 09:55:52 2020
  5. @author: tanya
  6. """
  7. from typing import Union, Iterable, Tuple, List
  8. import pandas as pd
  9. import numpy as np
  10. from itertools import accumulate, repeat, takewhile
  11. from cdplib.log import Log
  12. def make_expanding_cv(test_proportion: float,
  13. start_train_proportion: float,
  14. step_proportion: float = None,
  15. expanding_test_size: bool = False,
  16. data_set_size: Union[float, None] = None,
  17. index: Union[pd.Series, np.ndarray, list, None] = None)\
  18. -> Union[Iterable[Tuple[List]], None]:
  19. """
  20. """
  21. logger = Log("make_expanding_cv:")
  22. try:
  23. assert((index is None) != (data_set_size is None)),\
  24. "Set index or data_set_size"
  25. index = index if (index is not None)\
  26. else pd.Series(range(data_set_size))
  27. data_set_size = data_set_size or len(index)
  28. start_train_size = int(start_train_proportion * data_set_size)
  29. step_size = int(step_proportion * data_set_size)
  30. test_size = int(test_proportion * data_set_size)
  31. train_inds_set = (list(range(train_size))
  32. for train_size in
  33. takewhile(
  34. lambda x: x <= data_set_size - test_size,
  35. accumulate(repeat(start_train_size),
  36. lambda x, _: x + step_size)))
  37. for train_inds in train_inds_set:
  38. if expanding_test_size:
  39. yield (index[train_inds],
  40. index[train_inds[-1] + 1:
  41. train_inds[-1] + 1
  42. + int(test_proportion*len(train_inds))])
  43. else:
  44. yield (index[train_inds],
  45. index[train_inds[-1] + 1:
  46. train_inds[-1] + 1 + test_size])
  47. except Exception as e:
  48. logger.log_and_raise_error(("Failed to make expanding cv. "
  49. "Exit with error: {}".format(e)))
  50. if __name__ == "__main__":
  51. logger = Log("Test_expanding_cv: ")
  52. logger.info("Start Testing")
  53. logger.info("Testing expanding cv: ")
  54. cv = make_expanding_cv(data_set_size=50,
  55. test_proportion=0.1,
  56. start_train_proportion=0.6,
  57. step_proportion=0.1,
  58. expanding_test_size=True)
  59. cv = list(cv)
  60. logger.info("Testing expanding cv with datetime index")
  61. cv = make_expanding_cv(
  62. test_proportion=0.1,
  63. start_train_proportion=0.6,
  64. step_proportion=0.1,
  65. index=pd.date_range(start=pd.to_datetime("2020-01-01"),
  66. periods=50))
  67. cv = list(cv)
  68. logger.info("Finish testing")