#4 Added an abstract class PipelineSelector and two child classes, GridsearchPipelineSelector and HyperoptPipelineSelector

Otwarty
tanja chce scalić 36 commitów z tanja/new_pipeline_selection do tanja/master
52 zmienionych plików z 3788 dodań i 1496 usunięć
  1. 0 0
      .gitignore
  2. 0 0
      Pipfile
  3. 0 611
      Pipfile.lock
  4. 0 0
      README.md
  5. 0 0
      cdplib/DataExplorer/DataExplorer.py
  6. 0 0
      cdplib/FlattenData.py
  7. 0 0
      cdplib/Singleton_Threadsafe.py
  8. 0 0
      cdplib/__init__.py
  9. 20 20
      cdplib/db_handlers/InfluxdbHandler.py
  10. 0 0
      cdplib/db_handlers/MongodbHandler.py
  11. 1 2
      cdplib/db_handlers/SQLHandler.py
  12. 2 1
      cdplib/db_handlers/__init__.py
  13. 0 0
      cdplib/db_migration/DataFrameToCollection.py
  14. 27 25
      cdplib/db_migration/MigrationCleaning.py
  15. 0 0
      cdplib/db_migration/ParseDbSchema.py
  16. 0 0
      cdplib/db_migration/ParseJsonSchema.py
  17. 0 0
      cdplib/db_migration/ParseMapping.py
  18. 270 0
      cdplib/feature_engineering/StatisticalFeatures.py
  19. 77 0
      cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py
  20. 53 0
      cdplib/feature_engineering/StatisticalFeaturesOverTime.py
  21. 173 0
      cdplib/fine_tuning/FineTunedClassiferCV.py
  22. 375 0
      cdplib/gridsearch/GridSearchPipelineSelector.py
  23. 33 0
      cdplib/gridsearch/space_sample.py
  24. 0 798
      cdplib/hyperopt/HyperoptPipelineSelection.py
  25. 496 0
      cdplib/hyperopt/HyperoptPipelineSelector.py
  26. 0 0
      cdplib/hyperopt/__init__.py
  27. 116 0
      cdplib/hyperopt/composed_space_sample.py
  28. 40 0
      cdplib/hyperopt/space_sample.py
  29. 85 0
      cdplib/hyperparameter_space_composer/SpaceComposer.py
  30. 12 11
      cdplib/log.py
  31. 208 0
      cdplib/ml_validation/CVComposer.py
  32. 0 0
      cdplib/ml_validation/__init__.py
  33. 491 0
      cdplib/ml_validation/cross_validate_with_fine_tuning.py
  34. 97 0
      cdplib/ml_validation/expanding_cv.py
  35. 789 0
      cdplib/pipeline_selector/PipelineSelector.py
  36. 0 0
      cdplib/unit_tests/TestFlattenData.py
  37. 0 0
      cdplib/unit_tests/TestLog.py
  38. 0 0
      cdplib/unit_tests/TestMongodbHandler.py
  39. 0 0
      cdplib/unit_tests/invalid_test_schema.json
  40. 0 0
      cdplib/unit_tests/valid_test_schema.json
  41. 21 10
      cdplib/utils/CleaningUtils.py
  42. 28 18
      cdplib/utils/ExceptionsHandler.py
  43. 46 0
      cdplib/utils/LoadingUtils.py
  44. 36 0
      cdplib/utils/TypeConverter.py
  45. 0 0
      cdplib/utils/__init__.py
  46. 0 0
      classes.png
  47. 0 0
      hooks/README.txt
  48. 0 0
      hooks/pre-commit
  49. 0 0
      packages.png
  50. 0 0
      setup.py
  51. 115 0
      tests/testSQLOperations.py
  52. 177 0
      tests/testStatisticalFeatures.py

+ 0 - 0
.gitignore


+ 0 - 0
Pipfile


+ 0 - 611
Pipfile.lock

@@ -1,611 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1879ebbd4ee3fe44d9e59091889a69ead4c7b76e81b70de0dd74d12b5266cf42"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "boltons": {
-            "hashes": [
-                "sha256:3dd8a8e3c1886e7f7ba3422b50f55a66e1700161bf01b919d098e7d96dd2d9b6",
-                "sha256:dd362291a460cc1e0c2e91cc6a60da3036ced77099b623112e8f833e6734bdc5"
-            ],
-            "version": "==20.2.1"
-        },
-        "cdplib": {
-            "editable": true,
-            "git": "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git",
-            "ref": "623f7488557e373eb3181bb4099295ed17a53b5c"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
-                "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
-            ],
-            "version": "==2020.12.5"
-        },
-        "chardet": {
-            "hashes": [
-                "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
-                "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==4.0.0"
-        },
-        "cloudpickle": {
-            "hashes": [
-                "sha256:3a32d0eb0bc6f4d0c57fbc4f3e3780f7a81e6fee0fa935072884d58ae8e1cc7c",
-                "sha256:9bc994f9e9447593bd0a45371f0e7ac7333710fcf64a4eb9834bf149f4ef2f32"
-            ],
-            "markers": "python_version >= '3.5'",
-            "version": "==1.6.0"
-        },
-        "decorator": {
-            "hashes": [
-                "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
-                "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
-            ],
-            "version": "==4.4.2"
-        },
-        "future": {
-            "hashes": [
-                "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.18.2"
-        },
-        "greenlet": {
-            "hashes": [
-                "sha256:0a77691f0080c9da8dfc81e23f4e3cffa5accf0f5b56478951016d7cfead9196",
-                "sha256:0ddd77586553e3daf439aa88b6642c5f252f7ef79a39271c25b1d4bf1b7cbb85",
-                "sha256:111cfd92d78f2af0bc7317452bd93a477128af6327332ebf3c2be7df99566683",
-                "sha256:122c63ba795fdba4fc19c744df6277d9cfd913ed53d1a286f12189a0265316dd",
-                "sha256:181300f826625b7fd1182205b830642926f52bd8cdb08b34574c9d5b2b1813f7",
-                "sha256:1a1ada42a1fd2607d232ae11a7b3195735edaa49ea787a6d9e6a53afaf6f3476",
-                "sha256:1bb80c71de788b36cefb0c3bb6bfab306ba75073dbde2829c858dc3ad70f867c",
-                "sha256:1d1d4473ecb1c1d31ce8fd8d91e4da1b1f64d425c1dc965edc4ed2a63cfa67b2",
-                "sha256:292e801fcb3a0b3a12d8c603c7cf340659ea27fd73c98683e75800d9fd8f704c",
-                "sha256:2c65320774a8cd5fdb6e117c13afa91c4707548282464a18cf80243cf976b3e6",
-                "sha256:4365eccd68e72564c776418c53ce3c5af402bc526fe0653722bc89efd85bf12d",
-                "sha256:5352c15c1d91d22902582e891f27728d8dac3bd5e0ee565b6a9f575355e6d92f",
-                "sha256:58ca0f078d1c135ecf1879d50711f925ee238fe773dfe44e206d7d126f5bc664",
-                "sha256:5d4030b04061fdf4cbc446008e238e44936d77a04b2b32f804688ad64197953c",
-                "sha256:5d69bbd9547d3bc49f8a545db7a0bd69f407badd2ff0f6e1a163680b5841d2b0",
-                "sha256:5f297cb343114b33a13755032ecf7109b07b9a0020e841d1c3cedff6602cc139",
-                "sha256:62afad6e5fd70f34d773ffcbb7c22657e1d46d7fd7c95a43361de979f0a45aef",
-                "sha256:647ba1df86d025f5a34043451d7c4a9f05f240bee06277a524daad11f997d1e7",
-                "sha256:719e169c79255816cdcf6dccd9ed2d089a72a9f6c42273aae12d55e8d35bdcf8",
-                "sha256:7cd5a237f241f2764324396e06298b5dee0df580cf06ef4ada0ff9bff851286c",
-                "sha256:875d4c60a6299f55df1c3bb870ebe6dcb7db28c165ab9ea6cdc5d5af36bb33ce",
-                "sha256:90b6a25841488cf2cb1c8623a53e6879573010a669455046df5f029d93db51b7",
-                "sha256:94620ed996a7632723a424bccb84b07e7b861ab7bb06a5aeb041c111dd723d36",
-                "sha256:b5f1b333015d53d4b381745f5de842f19fe59728b65f0fbb662dafbe2018c3a5",
-                "sha256:c5b22b31c947ad8b6964d4ed66776bcae986f73669ba50620162ba7c832a6b6a",
-                "sha256:c93d1a71c3fe222308939b2e516c07f35a849c5047f0197442a4d6fbcb4128ee",
-                "sha256:cdb90267650c1edb54459cdb51dab865f6c6594c3a47ebd441bc493360c7af70",
-                "sha256:cfd06e0f0cc8db2a854137bd79154b61ecd940dce96fad0cba23fe31de0b793c",
-                "sha256:d3789c1c394944084b5e57c192889985a9f23bd985f6d15728c745d380318128",
-                "sha256:da7d09ad0f24270b20f77d56934e196e982af0d0a2446120cb772be4e060e1a2",
-                "sha256:df3e83323268594fa9755480a442cabfe8d82b21aba815a71acf1bb6c1776218",
-                "sha256:df8053867c831b2643b2c489fe1d62049a98566b1646b194cc815f13e27b90df",
-                "sha256:e1128e022d8dce375362e063754e129750323b67454cac5600008aad9f54139e",
-                "sha256:e6e9fdaf6c90d02b95e6b0709aeb1aba5affbbb9ccaea5502f8638e4323206be",
-                "sha256:eac8803c9ad1817ce3d8d15d1bb82c2da3feda6bee1153eec5c58fa6e5d3f770",
-                "sha256:eb333b90036358a0e2c57373f72e7648d7207b76ef0bd00a4f7daad1f79f5203",
-                "sha256:ed1d1351f05e795a527abc04a0d82e9aecd3bdf9f46662c36ff47b0b00ecaf06",
-                "sha256:f3dc68272990849132d6698f7dc6df2ab62a88b0d36e54702a8fd16c0490e44f",
-                "sha256:f59eded163d9752fd49978e0bab7a1ff21b1b8d25c05f0995d140cc08ac83379",
-                "sha256:f5e2d36c86c7b03c94b8459c3bd2c9fe2c7dab4b258b8885617d44a22e453fb7",
-                "sha256:f6f65bf54215e4ebf6b01e4bb94c49180a589573df643735107056f7a910275b",
-                "sha256:f8450d5ef759dbe59f84f2c9f77491bb3d3c44bc1a573746daf086e70b14c243",
-                "sha256:f97d83049715fd9dec7911860ecf0e17b48d8725de01e45de07d8ac0bd5bc378"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==1.0.0"
-        },
-        "hyperopt": {
-            "hashes": [
-                "sha256:bc6047d50f956ae64eebcb34b1fd40f186a93e214957f20e87af2f10195295cc",
-                "sha256:dc5c7cceaf33c125b727cf92709e70035d94dd507831dae66406ac762a18a253"
-            ],
-            "index": "pypi",
-            "version": "==0.2.5"
-        },
-        "idna": {
-            "hashes": [
-                "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
-                "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.10"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:c9db46394197244adf2f0b08ec5bc3cf16757e9590b02af1fca085c16c0d600a",
-                "sha256:d2d46ef77ffc85cbf7dac7e81dd663fde71c45326131bea8033b9bad42268ebe"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.10.0"
-        },
-        "influxdb": {
-            "hashes": [
-                "sha256:46f85e7b04ee4b3dee894672be6a295c94709003a7ddea8820deec2ac4d8b27a",
-                "sha256:65040a1f53d1a2a4f88a677e89e3a98189a7d30cf2ab61c318aaa89733280747"
-            ],
-            "index": "pypi",
-            "version": "==5.3.1"
-        },
-        "joblib": {
-            "hashes": [
-                "sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7",
-                "sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.0.1"
-        },
-        "jsonref": {
-            "hashes": [
-                "sha256:b1e82fa0b62e2c2796a13e5401fe51790b248f6d9bf9d7212a3e31a3501b291f",
-                "sha256:f3c45b121cf6257eafabdc3a8008763aed1cd7da06dbabc59a9e4d2a5e4e6697"
-            ],
-            "index": "pypi",
-            "version": "==0.2"
-        },
-        "msgpack": {
-            "hashes": [
-                "sha256:0cb94ee48675a45d3b86e61d13c1e6f1696f0183f0715544976356ff86f741d9",
-                "sha256:1026dcc10537d27dd2d26c327e552f05ce148977e9d7b9f1718748281b38c841",
-                "sha256:26a1759f1a88df5f1d0b393eb582ec022326994e311ba9c5818adc5374736439",
-                "sha256:2a5866bdc88d77f6e1370f82f2371c9bc6fc92fe898fa2dec0c5d4f5435a2694",
-                "sha256:31c17bbf2ae5e29e48d794c693b7ca7a0c73bd4280976d408c53df421e838d2a",
-                "sha256:497d2c12426adcd27ab83144057a705efb6acc7e85957a51d43cdcf7f258900f",
-                "sha256:5a9ee2540c78659a1dd0b110f73773533ee3108d4e1219b5a15a8d635b7aca0e",
-                "sha256:8521e5be9e3b93d4d5e07cb80b7e32353264d143c1f072309e1863174c6aadb1",
-                "sha256:87869ba567fe371c4555d2e11e4948778ab6b59d6cc9d8460d543e4cfbbddd1c",
-                "sha256:8ffb24a3b7518e843cd83538cf859e026d24ec41ac5721c18ed0c55101f9775b",
-                "sha256:92be4b12de4806d3c36810b0fe2aeedd8d493db39e2eb90742b9c09299eb5759",
-                "sha256:9ea52fff0473f9f3000987f313310208c879493491ef3ccf66268eff8d5a0326",
-                "sha256:a4355d2193106c7aa77c98fc955252a737d8550320ecdb2e9ac701e15e2943bc",
-                "sha256:a99b144475230982aee16b3d249170f1cccebf27fb0a08e9f603b69637a62192",
-                "sha256:ac25f3e0513f6673e8b405c3a80500eb7be1cf8f57584be524c4fa78fe8e0c83",
-                "sha256:b28c0876cce1466d7c2195d7658cf50e4730667196e2f1355c4209444717ee06",
-                "sha256:b55f7db883530b74c857e50e149126b91bb75d35c08b28db12dcb0346f15e46e",
-                "sha256:b6d9e2dae081aa35c44af9c4298de4ee72991305503442a5c74656d82b581fe9",
-                "sha256:c747c0cc08bd6d72a586310bda6ea72eeb28e7505990f342552315b229a19b33",
-                "sha256:d6c64601af8f3893d17ec233237030e3110f11b8a962cb66720bf70c0141aa54",
-                "sha256:d8167b84af26654c1124857d71650404336f4eb5cc06900667a493fc619ddd9f",
-                "sha256:de6bd7990a2c2dabe926b7e62a92886ccbf809425c347ae7de277067f97c2887",
-                "sha256:e36a812ef4705a291cdb4a2fd352f013134f26c6ff63477f20235138d1d21009",
-                "sha256:e89ec55871ed5473a041c0495b7b4e6099f6263438e0bd04ccd8418f92d5d7f2",
-                "sha256:f3e6aaf217ac1c7ce1563cf52a2f4f5d5b1f64e8729d794165db71da57257f0c",
-                "sha256:f484cd2dca68502de3704f056fa9b318c94b1539ed17a4c784266df5d6978c87",
-                "sha256:fae04496f5bc150eefad4e9571d1a76c55d021325dcd484ce45065ebbdd00984",
-                "sha256:fe07bc6735d08e492a327f496b7850e98cb4d112c56df69b0c844dbebcbb47f6"
-            ],
-            "version": "==1.0.2"
-        },
-        "mysql": {
-            "hashes": [
-                "sha256:55e66b5e7b3823b1da5fb2a063e95a628fb850b2a0b76bdcd884faac5d2daa7d"
-            ],
-            "index": "pypi",
-            "version": "==0.0.2"
-        },
-        "mysqlclient": {
-            "hashes": [
-                "sha256:0ac0dd759c4ca02c35a9fedc24bc982cf75171651e8187c2495ec957a87dfff7",
-                "sha256:3381ca1a4f37ff1155fcfde20836b46416d66531add8843f6aa6d968982731c3",
-                "sha256:71c4b330cf2313bbda0307fc858cc9055e64493ba9bf28454d25cf8b3ee8d7f5",
-                "sha256:f6ebea7c008f155baeefe16c56cd3ee6239f7a5a9ae42396c2f1860f08a7c432",
-                "sha256:fc575093cf81b6605bed84653e48b277318b880dc9becf42dd47fa11ffd3e2b6"
-            ],
-            "markers": "python_version >= '3.5'",
-            "version": "==2.0.3"
-        },
-        "networkx": {
-            "hashes": [
-                "sha256:0635858ed7e989f4c574c2328380b452df892ae85084144c73d8cd819f0c4e06",
-                "sha256:109cd585cac41297f71103c3c42ac6ef7379f29788eb54cb751be5a663bb235a"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.5.1"
-        },
-        "numpy": {
-            "hashes": [
-                "sha256:2428b109306075d89d21135bdd6b785f132a1f5a3260c371cee1fae427e12727",
-                "sha256:377751954da04d4a6950191b20539066b4e19e3b559d4695399c5e8e3e683bf6",
-                "sha256:4703b9e937df83f5b6b7447ca5912b5f5f297aba45f91dbbbc63ff9278c7aa98",
-                "sha256:471c0571d0895c68da309dacee4e95a0811d0a9f9f532a48dc1bea5f3b7ad2b7",
-                "sha256:61d5b4cf73622e4d0c6b83408a16631b670fc045afd6540679aa35591a17fe6d",
-                "sha256:6c915ee7dba1071554e70a3664a839fbc033e1d6528199d4621eeaaa5487ccd2",
-                "sha256:6e51e417d9ae2e7848314994e6fc3832c9d426abce9328cf7571eefceb43e6c9",
-                "sha256:719656636c48be22c23641859ff2419b27b6bdf844b36a2447cb39caceb00935",
-                "sha256:780ae5284cb770ade51d4b4a7dce4faa554eb1d88a56d0e8b9f35fca9b0270ff",
-                "sha256:878922bf5ad7550aa044aa9301d417e2d3ae50f0f577de92051d739ac6096cee",
-                "sha256:924dc3f83de20437de95a73516f36e09918e9c9c18d5eac520062c49191025fb",
-                "sha256:97ce8b8ace7d3b9288d88177e66ee75480fb79b9cf745e91ecfe65d91a856042",
-                "sha256:9c0fab855ae790ca74b27e55240fe4f2a36a364a3f1ebcfd1fb5ac4088f1cec3",
-                "sha256:9cab23439eb1ebfed1aaec9cd42b7dc50fc96d5cd3147da348d9161f0501ada5",
-                "sha256:a8e6859913ec8eeef3dbe9aed3bf475347642d1cdd6217c30f28dee8903528e6",
-                "sha256:aa046527c04688af680217fffac61eec2350ef3f3d7320c07fd33f5c6e7b4d5f",
-                "sha256:abc81829c4039e7e4c30f7897938fa5d4916a09c2c7eb9b244b7a35ddc9656f4",
-                "sha256:bad70051de2c50b1a6259a6df1daaafe8c480ca98132da98976d8591c412e737",
-                "sha256:c73a7975d77f15f7f68dacfb2bca3d3f479f158313642e8ea9058eea06637931",
-                "sha256:d15007f857d6995db15195217afdbddfcd203dfaa0ba6878a2f580eaf810ecd6",
-                "sha256:d76061ae5cab49b83a8cf3feacefc2053fac672728802ac137dd8c4123397677",
-                "sha256:e8e4fbbb7e7634f263c5b0150a629342cc19b47c5eba8d1cd4363ab3455ab576",
-                "sha256:e9459f40244bb02b2f14f6af0cd0732791d72232bbb0dc4bab57ef88e75f6935",
-                "sha256:edb1f041a9146dcf02cd7df7187db46ab524b9af2515f392f337c7cbbf5b52cd"
-            ],
-            "markers": "python_version >= '3.7'",
-            "version": "==1.20.2"
-        },
-        "pandas": {
-            "hashes": [
-                "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b",
-                "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f",
-                "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648",
-                "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb",
-                "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98",
-                "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a",
-                "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11",
-                "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a",
-                "sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e",
-                "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae",
-                "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d",
-                "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9",
-                "sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b",
-                "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788",
-                "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca",
-                "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5",
-                "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a",
-                "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814",
-                "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d",
-                "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086",
-                "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a",
-                "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb",
-                "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782",
-                "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"
-            ],
-            "index": "pypi",
-            "version": "==1.1.5"
-        },
-        "pymongo": {
-            "hashes": [
-                "sha256:0384d76b409278ddb34ac19cdc4664511685959bf719adbdc051875ded4689aa",
-                "sha256:05e2bda928a3a6bc6ddff9e5a8579d41928b75d7417b18f9a67c82bb52150ac6",
-                "sha256:152e4ac3158b776135d8fce28d2ac06e682b885fcbe86690d66465f262ab244e",
-                "sha256:180511abfef70feb022360b35f4863dd68e08334197089201d5c52208de9ca2e",
-                "sha256:19d52c60dc37520385f538d6d1a4c40bc398e0885f4ed6a36ce10b631dab2852",
-                "sha256:1d559a76ae87143ad96c2ecd6fdd38e691721e175df7ced3fcdc681b4638bca1",
-                "sha256:210ec4a058480b9c3869082e52b66d80c4a48eda9682d7a569a1a5a48100ea54",
-                "sha256:2163d736d6f62b20753be5da3dc07a188420b355f057fcbb3075b05ee6227b2f",
-                "sha256:22ee2c94fee1e391735be63aa1c9af4c69fdcb325ae9e5e4ddff770248ef60a6",
-                "sha256:28633868be21a187702a8613913e13d1987d831529358c29fc6f6670413df040",
-                "sha256:29390c39ca873737689a0749c9c3257aad96b323439b11279fbc0ba8626ec9c5",
-                "sha256:2aeb108da1ed8e066800fb447ba5ae89d560e6773d228398a87825ac3630452d",
-                "sha256:322f6cc7bf23a264151ebc5229a92600c4b55ac83c83c91c9bab1ec92c888a8d",
-                "sha256:34c15f5798f23488e509eae82fbf749c3d17db74379a88c07c869ece1aa806b9",
-                "sha256:3873866534b6527e6863e742eb23ea2a539e3c7ee00ad3f9bec9da27dbaaff6f",
-                "sha256:3dbc67754882d740f17809342892f0b24398770bd99d48c5cb5ba89f5f5dee4e",
-                "sha256:413b18ac2222f5d961eb8d1c8dcca6c6ca176c8613636d8c13aa23abae7f7a21",
-                "sha256:42f9ec9d77358f557fe17cc15e796c4d4d492ede1a30cba3664822cae66e97c5",
-                "sha256:4ac387ac1be71b798d1c372a924f9c30352f30e684e06f086091297352698ac0",
-                "sha256:4ca92e15fcf02e02e7c24b448a16599b98c9d0e6a46cd85cc50804450ebf7245",
-                "sha256:4d959e929cec805c2bf391418b1121590b4e7d5cb00af7b1ba521443d45a0918",
-                "sha256:5091aacbdb667b418b751157f48f6daa17142c4f9063d58e5a64c90b2afbdf9a",
-                "sha256:5a03ae5ac85b04b2034a0689add9ff597b16d5e24066a87f6ab0e9fa67049156",
-                "sha256:5e1341276ce8b7752db9aeac6bbb0cbe82a3f6a6186866bf6b4906d8d328d50b",
-                "sha256:6043d251fac27ca04ff22ed8deb5ff7a43dc18e8a4a15b4c442d2a20fa313162",
-                "sha256:610d5cbbfd026e2f6d15665af51e048e49b68363fedece2ed318cc8fe080dd94",
-                "sha256:622a5157ffcd793d305387c1c9fb94185f496c8c9fd66dafb59de0807bc14ad7",
-                "sha256:65b67637f0a25ac9d25efb13c1578eb065870220ffa82f132c5b2d8e43ac39c3",
-                "sha256:66573c8c7808cce4f3b56c23cb7cad6c3d7f4c464b9016d35f5344ad743896d7",
-                "sha256:66b688fc139c6742057795510e3b12c4acbf90d11af1eff9689a41d9c84478d6",
-                "sha256:685b884fa41bd2913fd20af85866c4ff886b7cbb7e4833b918996aa5d45a04be",
-                "sha256:6a5834e392c97f19f36670e34bf9d346d733ad89ee0689a6419dd737dfa4308a",
-                "sha256:728313cc0d59d1a1a004f675607dcf5c711ced3f55e75d82b3f264fd758869f3",
-                "sha256:733e1cfffc4cd99848230e2999c8a86e284c6af6746482f8ad2ad554dce14e39",
-                "sha256:7814b2cf23aad23464859973c5cd2066ca2fd99e0b934acefbb0b728ac2525bf",
-                "sha256:7c77801620e5e75fb9c7abae235d3cc45d212a67efa98f4972eef63e736a8daa",
-                "sha256:7cd42c66d49ffb68dea065e1c8a4323e7ceab386e660fee9863d4fa227302ba9",
-                "sha256:7d2ae2f7c50adec20fde46a73465de31a6a6fbb4903240f8b7304549752ca7a1",
-                "sha256:7edff02e44dd0badd749d7342e40705a398d98c5d8f7570f57cff9568c2351fa",
-                "sha256:87981008d565f647142869d99915cc4760b7725858da3d39ecb2a606e23f36fd",
-                "sha256:92e2376ce3ca0e3e443b3c5c2bb5d584c7e59221edfb0035313c6306049ba55a",
-                "sha256:950710f7370613a6bfa2ccd842b488c5b8072e83fb6b7d45d99110bf44651d06",
-                "sha256:980527f4ccc6644855bb68056fe7835da6d06d37776a52df5bcc1882df57c3db",
-                "sha256:9fbffc5bad4df99a509783cbd449ed0d24fcd5a450c28e7756c8f20eda3d2aa5",
-                "sha256:a8b02e0119d6ee381a265d8d2450a38096f82916d895fed2dfd81d4c7a54d6e4",
-                "sha256:b17e627844d86031c77147c40bf992a6e1114025a460874deeda6500d0f34862",
-                "sha256:b1aa62903a2c5768b0001632efdea2e8da6c80abdd520c2e8a16001cc9affb23",
-                "sha256:b32e4eed2ef19a20dfb57698497a9bc54e74efb2e260c003e9056c145f130dc7",
-                "sha256:b44fa04720bbfd617b6aef036989c8c30435f11450c0a59136291d7b41ed647f",
-                "sha256:b4535d98df83abebb572035754fb3d4ad09ce7449375fa09fa9ede2dbc87b62b",
-                "sha256:bb6a5777bf558f444cd4883d617546182cfeff8f2d4acd885253f11a16740534",
-                "sha256:bc2eb67387b8376120a2be6cba9d23f9d6a6c3828e00fb0a64c55ad7b54116d1",
-                "sha256:bd351ceb2decd23d523fc50bad631ee9ae6e97e7cdc355ce5600fe310484f96e",
-                "sha256:bf70097bd497089f1baabf9cbb3ec4f69c022dc7a70c41ba9c238fa4d0fff7ab",
-                "sha256:c7fd18d4b7939408df9315fedbdb05e179760960a92b3752498e2fcd03f24c3d",
-                "sha256:cc359e408712faf9ea775f4c0ec8f2bfc843afe47747a657808d9595edd34d71",
-                "sha256:cd8fc35d4c0c717cc29b0cb894871555cb7137a081e179877ecc537e2607f0b9",
-                "sha256:daa44cefde19978af57ac1d50413cd86ebf2b497328e7a27832f5824bda47439",
-                "sha256:db5098587f58fbf8582d9bda2462762b367207246d3e19623782fb449c3c5fcc",
-                "sha256:db6fd53ef5f1914ad801830406440c3bfb701e38a607eda47c38adba267ba300",
-                "sha256:e1414599a97554d451e441afb362dbee1505e4550852c0068370d843757a3fe2",
-                "sha256:ee42a8f850143ae7c67ea09a183a6a4ad8d053e1dbd9a1134e21a7b5c1bc6c73",
-                "sha256:f23abcf6eca5859a2982beadfb5111f8c5e76e30ff99aaee3c1c327f814f9f10",
-                "sha256:f6748c447feeadda059719ef5ab1fb9d84bd370e205b20049a0e8b45ef4ad593"
-            ],
-            "index": "pypi",
-            "version": "==3.11.3"
-        },
-        "pymysql": {
-            "hashes": [
-                "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641",
-                "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"
-            ],
-            "index": "pypi",
-            "version": "==1.0.2"
-        },
-        "python-dateutil": {
-            "hashes": [
-                "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
-                "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.8.1"
-        },
-        "pytz": {
-            "hashes": [
-                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
-                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
-            ],
-            "version": "==2021.1"
-        },
-        "requests": {
-            "hashes": [
-                "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
-                "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==2.25.1"
-        },
-        "scikit-learn": {
-            "hashes": [
-                "sha256:0567a2d29ad08af98653300c623bd8477b448fe66ced7198bef4ed195925f082",
-                "sha256:087dfede39efb06ab30618f9ab55a0397f29c38d63cd0ab88d12b500b7d65fd7",
-                "sha256:1adf483e91007a87171d7ce58c34b058eb5dab01b5fee6052f15841778a8ecd8",
-                "sha256:259ec35201e82e2db1ae2496f229e63f46d7f1695ae68eef9350b00dc74ba52f",
-                "sha256:3c4f07f47c04e81b134424d53c3f5e16dfd7f494e44fd7584ba9ce9de2c5e6c1",
-                "sha256:4562dcf4793e61c5d0f89836d07bc37521c3a1889da8f651e2c326463c4bd697",
-                "sha256:4ddd2b6f7449a5d539ff754fa92d75da22de261fd8fdcfb3596799fadf255101",
-                "sha256:54be0a60a5a35005ad69c75902e0f5c9f699db4547ead427e97ef881c3242e6f",
-                "sha256:5580eba7345a4d3b097be2f067cc71a306c44bab19e8717a30361f279c929bea",
-                "sha256:7b04691eb2f41d2c68dbda8d1bd3cb4ef421bdc43aaa56aeb6c762224552dfb6",
-                "sha256:826b92bf45b8ad80444814e5f4ac032156dd481e48d7da33d611f8fe96d5f08b",
-                "sha256:83b21ff053b1ff1c018a2d24db6dd3ea339b1acfbaa4d9c881731f43748d8b3b",
-                "sha256:8772b99d683be8f67fcc04789032f1b949022a0e6880ee7b75a7ec97dbbb5d0b",
-                "sha256:895dbf2030aa7337649e36a83a007df3c9811396b4e2fa672a851160f36ce90c",
-                "sha256:8aa1b3ac46b80eaa552b637eeadbbce3be5931e4b5002b964698e33a1b589e1e",
-                "sha256:9599a3f3bf33f73fed0fe06d1dfa4e6081365a58c1c807acb07271be0dce9733",
-                "sha256:99349d77f54e11f962d608d94dfda08f0c9e5720d97132233ebdf35be2858b2d",
-                "sha256:9a24d1ccec2a34d4cd3f2a1f86409f3f5954cc23d4d2270ba0d03cf018aa4780",
-                "sha256:9bed8a1ef133c8e2f13966a542cb8125eac7f4b67dcd234197c827ba9c7dd3e0",
-                "sha256:9c6097b6a9b2bafc5e0f31f659e6ab5e131383209c30c9e978c5b8abdac5ed2a",
-                "sha256:9dfa564ef27e8e674aa1cc74378416d580ac4ede1136c13dd555a87996e13422",
-                "sha256:a0334a1802e64d656022c3bfab56a73fbd6bf4b1298343f3688af2151810bbdf",
-                "sha256:a29460499c1e62b7a830bb57ca42e615375a6ab1bcad053cd25b493588348ea8",
-                "sha256:a36e159a0521e13bbe15ca8c8d038b3a1dd4c7dad18d276d76992e03b92cf643",
-                "sha256:abe835a851610f87201819cb315f8d554e1a3e8128912783a31e87264ba5ffb7",
-                "sha256:c13ebac42236b1c46397162471ea1c46af68413000e28b9309f8c05722c65a09",
-                "sha256:c3deb3b19dd9806acf00cf0d400e84562c227723013c33abefbbc3cf906596e9",
-                "sha256:c658432d8a20e95398f6bb95ff9731ce9dfa343fdf21eea7ec6a7edfacd4b4d9",
-                "sha256:c7f4eb77504ac586d8ac1bde1b0c04b504487210f95297235311a0ab7edd7e38",
-                "sha256:d54dbaadeb1425b7d6a66bf44bee2bb2b899fe3e8850b8e94cfb9c904dcb46d0",
-                "sha256:ddb52d088889f5596bc4d1de981f2eca106b58243b6679e4782f3ba5096fd645",
-                "sha256:ed9d65594948678827f4ff0e7ae23344e2f2b4cabbca057ccaed3118fdc392ca",
-                "sha256:fab31f48282ebf54dd69f6663cd2d9800096bad1bb67bbc9c9ac84eb77b41972"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==0.24.1"
-        },
-        "scipy": {
-            "hashes": [
-                "sha256:03f1fd3574d544456325dae502facdf5c9f81cbfe12808a5e67a737613b7ba8c",
-                "sha256:0c81ea1a95b4c9e0a8424cf9484b7b8fa7ef57169d7bcc0dfcfc23e3d7c81a12",
-                "sha256:1fba8a214c89b995e3721670e66f7053da82e7e5d0fe6b31d8e4b19922a9315e",
-                "sha256:37f4c2fb904c0ba54163e03993ce3544c9c5cde104bcf90614f17d85bdfbb431",
-                "sha256:50e5bcd9d45262725e652611bb104ac0919fd25ecb78c22f5282afabd0b2e189",
-                "sha256:6ca1058cb5bd45388041a7c3c11c4b2bd58867ac9db71db912501df77be2c4a4",
-                "sha256:77f7a057724545b7e097bfdca5c6006bed8580768cd6621bb1330aedf49afba5",
-                "sha256:816951e73d253a41fa2fd5f956f8e8d9ac94148a9a2039e7db56994520582bf2",
-                "sha256:96620240b393d155097618bcd6935d7578e85959e55e3105490bbbf2f594c7ad",
-                "sha256:993c86513272bc84c451349b10ee4376652ab21f312b0554fdee831d593b6c02",
-                "sha256:adf7cee8e5c92b05f2252af498f77c7214a2296d009fc5478fc432c2f8fb953b",
-                "sha256:bc52d4d70863141bb7e2f8fd4d98e41d77375606cde50af65f1243ce2d7853e8",
-                "sha256:c1d3f771c19af00e1a36f749bd0a0690cc64632783383bc68f77587358feb5a4",
-                "sha256:d744657c27c128e357de2f0fd532c09c84cd6e4933e8232895a872e67059ac37",
-                "sha256:e3e9742bad925c421d39e699daa8d396c57535582cba90017d17f926b61c1552",
-                "sha256:e547f84cd52343ac2d56df0ab08d3e9cc202338e7d09fafe286d6c069ddacb31",
-                "sha256:e89091e6a8e211269e23f049473b2fde0c0e5ae0dd5bd276c3fc91b97da83480",
-                "sha256:e9da33e21c9bc1b92c20b5328adb13e5f193b924c9b969cd700c8908f315aa59",
-                "sha256:ffdfb09315896c6e9ac739bb6e13a19255b698c24e6b28314426fd40a1180822"
-            ],
-            "markers": "python_version < '3.10' and python_version >= '3.7'",
-            "version": "==1.6.2"
-        },
-        "simplejson": {
-            "hashes": [
-                "sha256:034550078a11664d77bc1a8364c90bb7eef0e44c2dbb1fd0a4d92e3997088667",
-                "sha256:05b43d568300c1cd43f95ff4bfcff984bc658aa001be91efb3bb21df9d6288d3",
-                "sha256:0dd9d9c738cb008bfc0862c9b8fa6743495c03a0ed543884bf92fb7d30f8d043",
-                "sha256:10fc250c3edea4abc15d930d77274ddb8df4803453dde7ad50c2f5565a18a4bb",
-                "sha256:2862beabfb9097a745a961426fe7daf66e1714151da8bb9a0c430dde3d59c7c0",
-                "sha256:292c2e3f53be314cc59853bd20a35bf1f965f3bc121e007ab6fd526ed412a85d",
-                "sha256:2d3eab2c3fe52007d703a26f71cf649a8c771fcdd949a3ae73041ba6797cfcf8",
-                "sha256:2e7b57c2c146f8e4dadf84977a83f7ee50da17c8861fd7faf694d55e3274784f",
-                "sha256:311f5dc2af07361725033b13cc3d0351de3da8bede3397d45650784c3f21fbcf",
-                "sha256:344e2d920a7f27b4023c087ab539877a1e39ce8e3e90b867e0bfa97829824748",
-                "sha256:3fabde09af43e0cbdee407555383063f8b45bfb52c361bc5da83fcffdb4fd278",
-                "sha256:42b8b8dd0799f78e067e2aaae97e60d58a8f63582939af60abce4c48631a0aa4",
-                "sha256:4b3442249d5e3893b90cb9f72c7d6ce4d2ea144d2c0d9f75b9ae1e5460f3121a",
-                "sha256:55d65f9cc1b733d85ef95ab11f559cce55c7649a2160da2ac7a078534da676c8",
-                "sha256:5c659a0efc80aaaba57fcd878855c8534ecb655a28ac8508885c50648e6e659d",
-                "sha256:72d8a3ffca19a901002d6b068cf746be85747571c6a7ba12cbcf427bfb4ed971",
-                "sha256:75ecc79f26d99222a084fbdd1ce5aad3ac3a8bd535cd9059528452da38b68841",
-                "sha256:76ac9605bf2f6d9b56abf6f9da9047a8782574ad3531c82eae774947ae99cc3f",
-                "sha256:7d276f69bfc8c7ba6c717ba8deaf28f9d3c8450ff0aa8713f5a3280e232be16b",
-                "sha256:7f10f8ba9c1b1430addc7dd385fc322e221559d3ae49b812aebf57470ce8de45",
-                "sha256:8042040af86a494a23c189b5aa0ea9433769cc029707833f261a79c98e3375f9",
-                "sha256:813846738277729d7db71b82176204abc7fdae2f566e2d9fcf874f9b6472e3e6",
-                "sha256:845a14f6deb124a3bcb98a62def067a67462a000e0508f256f9c18eff5847efc",
-                "sha256:869a183c8e44bc03be1b2bbcc9ec4338e37fa8557fc506bf6115887c1d3bb956",
-                "sha256:8acf76443cfb5c949b6e781c154278c059b09ac717d2757a830c869ba000cf8d",
-                "sha256:8f713ea65958ef40049b6c45c40c206ab363db9591ff5a49d89b448933fa5746",
-                "sha256:934115642c8ba9659b402c8bdbdedb48651fb94b576e3b3efd1ccb079609b04a",
-                "sha256:9551f23e09300a9a528f7af20e35c9f79686d46d646152a0c8fc41d2d074d9b0",
-                "sha256:9a2b7543559f8a1c9ed72724b549d8cc3515da7daf3e79813a15bdc4a769de25",
-                "sha256:a55c76254d7cf8d4494bc508e7abb993a82a192d0db4552421e5139235604625",
-                "sha256:ad8f41c2357b73bc9e8606d2fa226233bf4d55d85a8982ecdfd55823a6959995",
-                "sha256:af4868da7dd53296cd7630687161d53a7ebe2e63814234631445697bd7c29f46",
-                "sha256:afebfc3dd3520d37056f641969ce320b071bc7a0800639c71877b90d053e087f",
-                "sha256:b59aa298137ca74a744c1e6e22cfc0bf9dca3a2f41f51bc92eb05695155d905a",
-                "sha256:bc00d1210567a4cdd215ac6e17dc00cb9893ee521cee701adfd0fa43f7c73139",
-                "sha256:c1cb29b1fced01f97e6d5631c3edc2dadb424d1f4421dad079cb13fc97acb42f",
-                "sha256:c94dc64b1a389a416fc4218cd4799aa3756f25940cae33530a4f7f2f54f166da",
-                "sha256:ceaa28a5bce8a46a130cd223e895080e258a88d51bf6e8de2fc54a6ef7e38c34",
-                "sha256:cff6453e25204d3369c47b97dd34783ca820611bd334779d22192da23784194b",
-                "sha256:d0b64409df09edb4c365d95004775c988259efe9be39697d7315c42b7a5e7e94",
-                "sha256:d4813b30cb62d3b63ccc60dd12f2121780c7a3068db692daeb90f989877aaf04",
-                "sha256:da3c55cdc66cfc3fffb607db49a42448785ea2732f055ac1549b69dcb392663b",
-                "sha256:e058c7656c44fb494a11443191e381355388443d543f6fc1a245d5d238544396",
-                "sha256:fed0f22bf1313ff79c7fc318f7199d6c2f96d4de3234b2f12a1eab350e597c06",
-                "sha256:ffd4e4877a78c84d693e491b223385e0271278f5f4e1476a4962dca6824ecfeb"
-            ],
-            "index": "pypi",
-            "version": "==3.17.2"
-        },
-        "six": {
-            "hashes": [
-                "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
-                "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.15.0"
-        },
-        "sklearn": {
-            "hashes": [
-                "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
-            ],
-            "version": "==0.0"
-        },
-        "sqlalchemy": {
-            "hashes": [
-                "sha256:013b659efe02f0f58e7f759602584899c921c178c6a972978f16460dcdd782d5",
-                "sha256:193c3ca465fbc68de071995a461ab535466f041089d372ee6a6f0aae7b9307e6",
-                "sha256:2071ee6cd9390a9527a80ef03458fb58e0166bb299db2c62f9d688b6772d76a1",
-                "sha256:21becd8b45ec70b703239cf915104e47889c2aad96d0f68f597b9b547cbfd787",
-                "sha256:2713b338d9c54d2c3c7ff4f7786a40a5ca85013c8ccea00327b034d42598e22e",
-                "sha256:2a042c27b1a32a87f4cead53bcdd28999324992650896094368a595165b31d97",
-                "sha256:2e65c1146f5b4151cc6e553d9847299c97f53640d94ba88b1c534e15cdc6ac38",
-                "sha256:345c201324066b789804411f07eea750e9f29872be052eba221ce76add647d50",
-                "sha256:360a771b538463053383fb6ff7aceffb595248d7059bb9e003bf70562a66510d",
-                "sha256:432e98e6fe0d24e8181eb4177e59cba9f8831dcaf272a0d2de75bc8b933952a0",
-                "sha256:4387ebd5ae8bc2c716dbfc1ece769c867307eeecc192e72a4d2e7fa0fc092646",
-                "sha256:43fef20dd1024409375cc646a4b5afaffb62f6488e41588cde2a1ed2e9432b5b",
-                "sha256:4d71ee83441826fb48771e58cef51191500a87734b4acb6b698ca018479395bd",
-                "sha256:4eeff8b12c7d22be4de98721bba5a042875f4365e9fd20dc3916eec474ccb81e",
-                "sha256:534c71caa87c7fdb136ce5073fb42b732a4eb390946f503d8e1d7ce6a4a79100",
-                "sha256:66467123c220689d55c6d51fdf88f7b0b62b8078823c5f6c0297ab47c22003d7",
-                "sha256:6c4af3aceeff6a0e2bd3657d8b25714a9f7c7c606e7ec52029284973094f84c1",
-                "sha256:7d252dea33c1ee07b3d702fb4962963996ea40e5a2615dbe7646ccabd851ac76",
-                "sha256:86a7321636f851c6e8009901c5d67e97d82b86ee8c6f28a476691c41c3d71a95",
-                "sha256:88d75ea6b4330a6f5596a49904f21762ff89ca763db065d63b815ad8c3d68952",
-                "sha256:8a296bbf367867aee2ea8d5b391cb04fbdb3ca7277cd1649d9e8114620f3b090",
-                "sha256:933427a5474e014d01bac93224cd4e2bc7bbc7ce531d0bd7e55e4f940cc8ce0d",
-                "sha256:93f6fe67a76d7fa1cca3b9febb36e9f2dd76055230e2bfa317969532f34c03ab",
-                "sha256:a687e552ab4ffedcf3ec3bd5256ab3e753b4f605b467e9fa39690b2dadb5f607",
-                "sha256:a69787f7fc87b84df7e2f27158476cdf39a79ebb95af1d6f696e474724af9ebe",
-                "sha256:a76c10b467f7d385e4cffe2185d975336acf0dbf24ed702c46207df0fb64055e",
-                "sha256:b093bd6efb49332021714bed5752e784a34ae6d6896ec56ffdc32cc83275a215",
-                "sha256:bdeb300bb9adc02f98957cd0cf0c38d641bdd435b0927e39870a772e0a750bc0",
-                "sha256:c719f0058951457a7761bb69c2e47781a9989ab4819b7a30b6b39141ad013a5f",
-                "sha256:cadb58aeadd9916e79e8f99a49d0c0a9e61ae2b24469c2b304a0699e41a25e59",
-                "sha256:cc3c0d87b11ae1dd1ccbd6fc7875a290b3f73b771254180c2e7b19c2aec7379b",
-                "sha256:d42b8e2bffdf9e01d66cf46472b938493b854ea790a0fbe2e2e42624fc253b33",
-                "sha256:d7684e0598acfbfb5110bea482d8c5e94f52001d6d66b5558177f41f49fb5930",
-                "sha256:e5267cd2e51ddefbe10bb182c36ba41cdaa51c83a0fdfa63ed8cbe89cbcf0f33"
-            ],
-            "index": "pypi",
-            "version": "==1.4.6"
-        },
-        "sqlalchemy-utils": {
-            "hashes": [
-                "sha256:fb66e9956e41340011b70b80f898fde6064ec1817af77199ee21ace71d7d6ab0"
-            ],
-            "version": "==0.36.8"
-        },
-        "sqlparse": {
-            "hashes": [
-                "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
-                "sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
-            ],
-            "index": "pypi",
-            "version": "==0.4.1"
-        },
-        "threadpoolctl": {
-            "hashes": [
-                "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725",
-                "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b"
-            ],
-            "markers": "python_version >= '3.5'",
-            "version": "==2.1.0"
-        },
-        "tqdm": {
-            "hashes": [
-                "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3",
-                "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==4.60.0"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
-                "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
-                "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.7.4.3"
-        },
-        "urllib3": {
-            "hashes": [
-                "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df",
-                "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
-            "version": "==1.26.4"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
-                "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.4.1"
-        }
-    },
-    "develop": {}
-}

+ 0 - 0
README.md


+ 0 - 0
cdplib/DataExplorer/DataExplorer.py


+ 0 - 0
cdplib/FlattenData.py


+ 0 - 0
cdplib/Singleton_Threadsafe.py


+ 0 - 0
cdplib/__init__.py


+ 20 - 20
cdplib/db_handlers/InfluxdbHandler.py

@@ -82,15 +82,15 @@ class InfluxdbHandler:
         try:
             # result of the query is a defaultdict
             result = self.client.query(query)
-            
+
             if len(list(result.values())) > 0:
 
                 return list(result.values())[0]
-            
+
             else:
-                
+
                 return pd.DataFrame()
-            
+
         except Exception as e:
             self._logger.log_and_raise_error(
                 ("Could not query to dataframe. "
@@ -118,30 +118,30 @@ class InfluxdbHandler:
 
         if (stop is not None) and (not isinstance(stop, str)):
             stop = datetime.strftime(stop, format="%Y-%m-%dT%H:%M:%SZ")
-            
+
         query = 'SELECT ' + columns + ' FROM \"' + tables
-            
+
         if (start is not None) and (stop is not None):
-            
+
              query += '\" WHERE time > \'' +\
                 str(start) +\
                 '\' AND time  < \'' +\
                 str(stop) +\
                 '\' tz(\'Europe/Berlin\');'
-                
+
         elif start is not None:
-            
+
             query += '\" WHERE time >= \'' + str(start) +\
                 '\' tz(\'Europe/Berlin\');'
-            
+
         elif stop is not None:
-            
+
             query += '\" WHERE time <= \'' + str(stop) +\
                 '\' tz(\'Europe/Berlin\');'
-                
+
         else:
             query += ';'
-            
+
 
         return self.query_to_dataframe(query)
 
@@ -150,13 +150,13 @@ class InfluxdbHandler:
                          batch_size: int = 10000,
                          time_precision: str = 'u'):
         """
-        Writes each column of the dataframe which is not 
+        Writes each column of the dataframe which is not
         in tag_columns as a separate measurement to the database.
-        
+
         Tag columns are put as tags to each measurement.
-        
+
         The dataframe has to have a datatime index!
-        
+
         :param dataframe: dataframe to write to the database
         :type dataframe: pd.DataFrame
         :param tag_columns: column names to be used as tags
@@ -166,10 +166,10 @@ class InfluxdbHandler:
         :param time_precision:
         :type tiime_precision: str
         """
-        
+
         measurement_columns = [c for c in dataframe.columns
                                if c not in (tag_columns or [])]
-        
+
         for column in measurement_columns:
             try:
                 self.client.write_points(
@@ -187,4 +187,4 @@ class InfluxdbHandler:
 
 if __name__ == "__main__":
 
-    influx_handler = InfluxdbHandler()
+    influx_handler = InfluxdbHandler()

+ 0 - 0
cdplib/db_handlers/MongodbHandler.py


+ 1 - 2
cdplib/db_handlers/SQLHandler.py

@@ -508,7 +508,6 @@ class SQLHandler:
         :rtype: DataFrame
         '''
         try:
-            
             connection = self._engine.connect()
 
             data = pd.read_sql(sql=query,
@@ -516,7 +515,7 @@ class SQLHandler:
                                **read_sql_kwargs)
 
             connection.close()
-           
+
             return data
 
         except Exception as e:

+ 2 - 1
cdplib/db_handlers/__init__.py

@@ -1,2 +1,3 @@
 from .MongodbHandler import *
-from .SQLHandler import *
+from .SQLHandler import *
+from .InfluxdbHandler import *

+ 0 - 0
cdplib/db_migration/DataFrameToCollection.py


+ 27 - 25
cdplib/db_migration/MigrationCleaning.py

@@ -255,9 +255,11 @@ class MigrationCleaning:
             columns = db.get_column_names(tablename=self._inconsist_report_table)
 
             if len(columns) > 0:
-                columns_not_in_data = [column for column in columns if column not in data.columns]
-                for value in columns_not_in_data:
-                    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
+                # TODO Tanya: the commented-out lines below caused the reason to be the same for all entries.
+
+                #columns_not_in_data = [column for column in columns if column not in data.columns]
+                #for value in columns_not_in_data:
+                #    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
                 data_inconsist = data_inconsist[columns]
 
         db.append_to_table(data=data_inconsist,
@@ -396,7 +398,7 @@ class MigrationCleaning:
                     data[column] = data[column].astype(python_type)
 
                 elif python_type == float:
-                    
+
                     data[column] = data[column].fillna(np.inf)
                     # Replaces empty fields when type is string
                     if data[column].dtypes == object:
@@ -564,15 +566,15 @@ class MigrationCleaning:
         return data
 
     def clean_json_from_None_object(self, data: pd.DataFrame, clean_bool: bool = True) -> pd.DataFrame():
-        
+
         data = data.to_json(date_format="iso")
         data = json.loads(data)
         new_data = remap(data, lambda p, k, v: v is not None)
         new_data = remap(new_data, lambda p, k, v: v != 'None')
         new_data = remap(new_data, lambda p, k, v: v != 'inf')
-        # cleans not only bool type also int which are 0 or 1 
+        # cleans not only bool type also int which are 0 or 1
         # only use if it is necessary have to be change that it only considers
-        # Ture and False for bools 
+        # Ture and False for bools
         if clean_bool:
             new_data = remap(new_data, lambda p, k, v: (isinstance(v,bool) or (not isinstance(v,bool) and bool(v))))
         return new_data
@@ -588,27 +590,27 @@ class MigrationCleaning:
 
 
     def map_toleranzen_values(self, data: pd.DataFrame, toleranzen: pd.DataFrame):
-        
+
         toleranzen.drop('nr', axis=1, inplace=True)
-        
+
         toleranzen.columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.geometrie.durchmesser.min', 'wellenschenkel.geometrie.durchmesser.max', 'innenring.geometrie.durchmesser.min',
                         'innenring.geometrie.durchmesser.max', 'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
 
-        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max', 
+        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max',
                                     'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
-        
+
         labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.geometrie.durchmesser.min', 'labyrinthring.geometrie.durchmesser.max']
-        
+
         reparatur_stufe_labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.reparatur_stufe.durchmesser.min', 'labyrinthring.reparatur_stufe.durchmesser.max']
 
-        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min', 
+        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel.reparatur_stufe.durchmesser.max', 'innenring.reparatur_stufe.durchmesser.min',
-                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min', 
+                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.max']
 
-        
+
         toleranzen_reference_columns = ['wellenschenkel_toleranz', 'labyrinthring_toleranz', 'wellen_reparatur_stufe_toleranz', 'labyrinthring_reparatur_stufe_toleranz']
-        
+
         available_columns = [column for column in data.columns if column in toleranzen_reference_columns]
         for column in available_columns:
             merge_map = [False] *len(data.index)
@@ -623,13 +625,13 @@ class MigrationCleaning:
 
                     else:
                         temp_toleranzen.columns = labyrinten_columns
-                
+
                 elif 'reparatur_stufe' in column:
                     temp_toleranzen.columns = reparatur_stufe_columns
                     merge_map = data['innenring_reparatur_stufe_zulaessig'] == 'Ja'
                 data_before = len(data.index)
                 data = data.merge(temp_toleranzen, how='left', left_on=column, right_on='toleranzbez_wellen_reference')
-                data.loc[merge_map, temp_toleranzen.columns] = np.nan 
+                data.loc[merge_map, temp_toleranzen.columns] = np.nan
                 if data_before != len(data.index):
                     print('WEVE LOST DATA!!')
                     print('before:', data_before, 'now:', len(data.index))
@@ -641,9 +643,9 @@ class MigrationCleaning:
 
     def label_is_level(
                     self,
-                    data: pd.DataFrame, 
-                    column: str = "is", 
-                    include_schrott: bool = False, 
+                    data: pd.DataFrame,
+                    column: str = "is",
+                    include_schrott: bool = False,
                     drop_rows_with_no_is: bool = False) -> pd.DataFrame:
         '''
         '''
@@ -659,16 +661,16 @@ class MigrationCleaning:
                 data.loc[data[column].isin(v), column] = k
             else:
                 data.loc[data[column].isnull(), column] = k
-        
+
         if include_schrott and ("operation_type_2" in data.columns):
             schrott_mask = (data["operation_type_2"] == 2)
             data.loc[schrott_mask, column] = 5
-        
+
         data.loc[~data[column].isin([0,1,2,3,4,5]), column] = 0
-                    
+
         if drop_rows_with_no_is:
             data = data.loc[data[column] != 0].copy(deep=True)
-            
+
         return data.reset_index(drop=True)
 
 

+ 0 - 0
cdplib/db_migration/ParseDbSchema.py


+ 0 - 0
cdplib/db_migration/ParseJsonSchema.py


+ 0 - 0
cdplib/db_migration/ParseMapping.py


+ 270 - 0
cdplib/feature_engineering/StatisticalFeatures.py

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" 
+Created on Tue Oct 16 16:08:47 2018
+
+@author: tanya
+"""
+import types
+import logging
+import pandas as pd
+
+from collections import defaultdict
+from functools import reduce
+
+from libraries.logging.logging_utils import configure_logging
+from libraries.exception_handling import InputChecks
+          
+class StatisticalFeatures:
+    '''
+    Groups data by index columns and returns aggregated statistics for given columns
+    
+    :param list of tuples or dict index_cols: 
+        is either a list of tuples of form: [(colname_1, [aggfunc_1, aggfunc_2]), 
+                                             (colname_2, aggfunc_3)]
+        or a dictionary of form: {colname_1 : [aggfunc_1, aggfunc_2], colname_2 : aggfunc_3}
+        where colname_i is column to aggregate and aggfunc_i are either 
+        function variables or strings accepted by pandas for built-in function names.
+        REMARK: using strings for built-in functions will speed up the calculations by a factor >= 20.
+        WARNING: if multiple aggfuncs with the same name are given for a given column (like 'sum' and np.sum),
+        then only the first one is kept.
+        WARNING: nan values are ignored by numpy and pandas built-in aggregation functions.
+        
+    '''
+    def __init__(self, data, index_cols, path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+            
+        self.logger = logging.getLogger(__name__)
+        
+        self.checks = InputChecks(logger = self.logger)
+        
+        self.data = data
+        
+        self.checks.assert_correct_type({'data', [pd.DataFrame]})
+            
+        self.index_cols = index_cols
+        
+        # make warning about missing values in index columns
+        for col in self.index_cols:
+            if data[col].isnull().any():
+                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
+
+        
+    def get_kpis_by_aggregation(self, kpis):
+        '''
+        Aggregates given fields with given aggregation functions
+         USE CASE: per product find mean and standard variation of a price
+        
+        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
+         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
+         where aggfunc-s are reducing functions of either function type or strings standing for functions built in pandas module
+         
+        :return: features with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        def get_valid_agg_dict_from_kpis(kpis):
+            '''
+            Filters inputs of incorrect shape or type,
+            Filters out columns not present in data
+            Removes multiple functions with the same name
+            Makes an a quick check that the aggregation with given fields and functions does not fail on the first 2 lines
+            Reports to the log
+            :param list or dict kpis:
+            '''
+            def get_name(x):
+                '''
+                Returns function name for function and does nothing for string
+                '''
+                if isinstance(x, types.FunctionType):
+                    return x.__name__
+                else:
+                    return x
+                
+            def passed_first_line_type_control(col, aggfunc):
+                '''
+                Checks if aggregation works on the first 2 lines of the data
+                '''
+                try:
+                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+                    self.data.iloc[:2]\
+                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                             .groupby(self.index_cols)\
+                             .agg({col : aggfunc})
+                    return True
+                except Exception as e:
+                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error : ', str(e))
+                    return False
+           
+            
+            
+            valid_kpi_dict = defaultdict(list)
+            
+            if isinstance(kpis, list):
+                incorrect_lengths = [len(kpi) !=2 for kpi in kpis]
+                if sum(incorrect_lengths) > 0:
+                    self.logger.warning('Inputs ' + str(kpis[incorrect_lengths]) + 'do not have correct length.')
+                
+                cols = list(zip(*kpis))[0]             
+                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
+            elif isinstance(kpis, dict):
+                cols = list(kpis.keys())
+                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
+                
+            cols_not_in_data = set(cols) - set(self.data.columns)
+            if len(cols_not_in_data) > 0:
+                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
+                
+            for col, aggfuncs in kpis:
+                if not isinstance(aggfuncs, list):
+                    aggfuncs = [aggfuncs]
+                
+                for aggfunc in aggfuncs:
+                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
+                    if not is_new_funcname:
+                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
+                    
+                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
+                        valid_kpi_dict[col].append(aggfunc)
+                    
+            return valid_kpi_dict
+                   
+        
+        
+        
+        agg_dict = get_valid_agg_dict_from_kpis(kpis)
+        
+        if len(agg_dict) > 0:
+        
+            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
+                             else '_'.join([col, str(aggfunc)]) 
+                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
+            
+            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                       .groupby(self.index_cols)\
+                       .agg(agg_dict)\
+                       .set_axis(new_names, axis = 'columns', inplace = False)\
+                       .reset_index()
+        else:
+            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+        
+        
+        
+        
+        
+        
+        
+    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
+        '''
+        A wrapper crosstab method with index equal to index_cols
+        USE CASE: per product find standart variation of the price in each city
+        
+        :param str pivot_col: column values of which become columns in the output
+        :param str value_col: column name to fillin vlaues
+        :param str or func aggfunc: count if None
+        :param list entries: values of pivot_col to show
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        
+        # assert that types of the inputs are correct
+        types_to_check = {'columns' : [str], 
+                          'value_col' : [str, type(None)],  
+                          'aggfunc' : ['str', types.FunctionType, type(None)], 
+                          'entries' : [list, type(None)]}
+        
+        self.checks.assert_correct_type(types_to_check)
+        
+        cols_to_check = [pivot_col]
+        if not value_col is None:
+            cols_to_check.append(value_col)
+        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
+
+        if not entries is None:
+            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
+        else:
+            entry_filter = pd.Series([True]*len(self.data))              
+    
+        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
+        columns = self.data.loc[entry_filter, pivot_col]
+        if not value_col is None:
+            value_col = self.data.loc[entry_filter, value_col]
+                        
+        result = pd.crosstab(index = index, columns = columns, values = value_col, aggfunc = aggfunc)
+        result = result.rename(columns = {c : value_col + '_' + str(c) for c in result.columns})\
+                       .reset_index()
+        return result
+    
+
+
+
+
+        
+    
+    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
+        '''
+        Aggregates values obtained with method get_value_stats
+         USE CASE: per product find average variation of the price over all cities
+         
+        :param str pivot_col:
+        :param str value_col:
+        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
+        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
+        :param list entries: 
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
+        
+        value_stat_kpis = self.get_value_stat_kpis(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
+
+        result = value_stat_kpis[self.index_cols].copy(deep = True)
+        
+        for aggfunc in aggfuncs_step2:
+            colname = '_'.join(aggfunc, aggfunc_step1, value_col, pivot_col)
+            
+            if isinstance(aggfunc, str):
+                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
+            else:
+                result[colname] = value_stat_kpis.set_index(self.index_cols)\
+                                                 .apply(aggfunc, axis = 1)\
+                                                 .reset_index(drop = True)
+                                                 
+        return result
+                              
+                              
+                              
+                              
+                                                            
+    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
+        '''
+        Finds argmin or argmax of a column
+         USE CASE: per product find the city with maximum variation of the price
+        
+        :param str min_or_max: must be in ['min', 'max']
+        :param str pivot_col:
+        :param str value_col:
+        :param str aggfunc:    
+        '''
+        self.checks.assert_valid_value(arname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
+        
+        if min_or_max == 'max':
+            aggfuncs_step2 = ['idxmax']
+        else:
+            aggfuncs_step2 = ['idxmin']
+            
+        return self.get_aggregated_value_stat_kpis(pivot_col = pivot_col, 
+                                                   value_col = value_col, 
+                                                   aggfunc_step1 = aggfunc, 
+                                                   aggfucs_step2 = aggfuncs_step2)
+        
+        
+        
+        
+    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
+    
+    # can also add peak detection from the other project and calculate the number of peaks. Probably first create TimeSeriesManipulation class.
+    
+    # write tests for all methods

+ 77 - 0
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 15:11:21 2018
+
+@author: tanya
+"""
+
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
+
+
+class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
+    '''
+    Statistical features computed per time period and then averaged
+    over the periods.
+
+    The time window of StatisticalFeaturesOverTime is widened to
+    n_periods*period_length, every (index, date) combination gets
+    a period number, the period number is added to the index columns,
+    and the outputs of the parent's feature methods are averaged
+    over the period number.
+    '''
+
+    def __init__(self, data, index_cols, date_col, split_date, period_length, past_or_future = 'past', freq = 'days', n_periods = 1, path_to_log = None):
+        '''
+        :param data: input table, copied before it is handed to the parent
+        :param index_cols: columns identifying an entity
+            (assumed to be a list of column names - TODO confirm)
+        :param date_col: name of the date column
+        :param split_date: reference date of the time window
+        :param period_length: length of one period; also the number
+            of distinct dates that are put into one period
+        :param str past_or_future: forwarded to the parent class
+        :param str freq: forwarded to the parent class
+        :param int n_periods: number of periods covered by the window
+        :param str path_to_log: forwarded to the parent class
+        '''
+        super().__init__(data = data.copy(deep = True),
+                         index_cols = index_cols,
+                         date_col = date_col,
+                         split_date = split_date,
+                         period_length = n_periods*period_length,
+                         past_or_future = past_or_future,
+                         freq = freq,
+                         path_to_log = path_to_log)
+
+        # NOTE(review): self.data and self.index_cols are expected to be
+        # set by the parent constructors - confirm in StatisticalFeatures.
+
+        # pick a column name that does not clash with the input columns
+        self.period_number_col = 'period_number'
+        while self.period_number_col in data.columns:
+            self.period_number_col += '&'
+
+        date_key_cols = list(self.index_cols) + [date_col]
+
+        # running count of the distinct dates of every entity, in
+        # chronological order; integer division puts period_length
+        # consecutive dates into the same period
+        period_numbers = self.data[date_key_cols]\
+                             .drop_duplicates()\
+                             .sort_values(by = date_key_cols)
+
+        period_numbers[self.period_number_col] = \
+            period_numbers.groupby(list(self.index_cols)).cumcount()//period_length
+
+        # the date column is part of the key, so every row
+        # receives exactly one period number
+        self.data = pd.merge(self.data, period_numbers, how = 'left', on = date_key_cols)
+
+        # new list objects: the list passed by the caller is not modified
+        self.initial_index_cols = list(self.index_cols)
+        self.index_cols = self.initial_index_cols + [self.period_number_col]
+
+
+    def _aggregate_over_time_periods(self, df):
+        '''
+        Averages the per-period results over the period number.
+
+        :param df: result of a feature method computed with the period
+            number among the index columns
+        :return: mean over the periods per combination of
+            the initial index columns
+        '''
+        return df.drop(self.period_number_col, axis = 1)\
+                 .groupby(self.initial_index_cols)\
+                 .mean()\
+                 .reset_index()
+
+
+    def get_kpis_by_aggregation(self, **args):
+        '''
+        Parent's get_kpis_by_aggregation averaged over the time periods.
+        Keyword arguments are forwarded unchanged.
+        '''
+        return self._aggregate_over_time_periods(
+                super().get_kpis_by_aggregation(**args))
+
+
+    def get_value_stats(self, **args):
+        '''
+        Parent's get_value_stats averaged over the time periods.
+        Keyword arguments are forwarded unchanged.
+        '''
+        return self._aggregate_over_time_periods(
+                super().get_value_stats(**args))
+
+
+    def get_aggregated_value_stats(self, **args):
+        '''
+        Parent's get_aggregated_value_stats averaged over the time periods.
+        Keyword arguments are forwarded unchanged.
+        '''
+        # NOTE(review): confirm that the parent exposes a method with
+        # exactly this name.
+        return self._aggregate_over_time_periods(
+                super().get_aggregated_value_stats(**args))
+        
+    
+        

+ 53 - 0
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 14:02:18 2018
+
+@author: tanya
+"""
+
+import logging
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
+from libraries.exception_handling import InputChecks, InputCasts
+from libraries.logging.logging_utils import configure_logging
+
+class StatisticalFeaturesOverTime(StatisticalFeatures):
+    '''
+    StatisticalFeatures computed on the rows of a time window
+    that starts or ends at a split date.
+    '''
+    def __init__(self, data, index_cols, date_col, split_date, period_length = None, past_or_future = 'past', freq = 'days', path_to_log = None):
+        '''
+        :param data: input table, must contain date_col.
+            The table of the caller is not modified.
+        :param index_cols: forwarded to StatisticalFeatures
+        :param date_col: name of the date column
+        :param split_date: boundary of the time window, cast to
+            a pandas datetime before use
+        :param period_length: window length in units of freq.
+            If None: for 'past' the window starts at the earliest date
+            in the data, for 'future' it is one unit of freq long.
+        :param str past_or_future: 'past' selects the window that ends at
+            split_date (exclusive), 'future' the window that starts
+            at split_date (inclusive)
+        :param str freq: one of 'seconds', 'minutes', 'hours', 'days',
+            'weeks', 'months', 'years'
+        :param str path_to_log: forwarded to configure_logging and
+            to StatisticalFeatures
+        '''
+        configure_logging(path_to_log)
+        self.logger = logging.getLogger(__name__)
+        self.checks = InputChecks(logger = self.logger)
+        self.casts = InputCasts(logger = self.logger)
+
+        self.checks.assert_column_presence(data = data, colnames = [date_col])
+        self.checks.assert_valid_value(argname = 'past_or_future', val = past_or_future, valid_values = ['past', 'future'])
+        self.checks.assert_valid_value(argname = 'freq', val = freq, valid_values = ['seconds', 'minutes', 'hours', 'days', 'weeks', 'months', 'years'])
+
+        # cast first: the window bounds below are computed
+        # by date arithmetic on split_date
+        split_date = self.casts.cast_arg_to_pandas_datetime(argname = 'split_date', val = split_date)
+
+        # shallow copy, so that replacing the date column
+        # does not touch the table of the caller
+        data = data.copy(deep = False)
+        data[date_col] = self.casts.cast_column_to_pandas_datetime(series = data[date_col], colname = date_col, all_or_any = 'all')
+
+        if past_or_future == 'past':
+            if period_length is not None:
+                min_date = split_date - pd.DateOffset(**{freq : period_length})
+            else:
+                min_date = data[date_col].min()
+            sup_date = split_date
+        else:
+            min_date = split_date
+            if period_length is not None:
+                sup_date = split_date + pd.DateOffset(**{freq : period_length})
+            else:
+                sup_date = split_date + pd.DateOffset(**{freq : 1})
+
+        # half-open window [min_date, sup_date)
+        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)
+
+        super().__init__(data = data.loc[time_mask].reset_index(drop = True).copy(deep = True),
+                         index_cols = index_cols,
+                         path_to_log = path_to_log)

+ 173 - 0
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 23 08:51:53 2020
+
+@author: tanya
+
+@description: class for fine-tuning a sklearn classifier
+(optimizing the probability threshold)
+"""
+
+import pandas as pd
+import numpy as np
+
+from typing import Callable
+
+from sklearn.base import (BaseEstimator, ClassifierMixin,
+                          clone, MetaEstimatorMixin)
+
+from cdplib.log import Log
+
+from cdplib.utils.TyperConverter import TypeConverter
+
+
+class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
+                            MetaEstimatorMixin):
+    """
+    Probability threshold tuning for a given binary classifier.
+    Overrides the method predict of the given sklearn classifier
+    and returns predictions with the optimal value of
+    the probability threshold.
+
+    The threshold is compared with the second column of predict_proba,
+    and predictions are returned as 0/1 integers.
+
+    An object of this class can be passed to an sklearn Pipeline
+    """
+    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
+                 cv=None, threshold_step: float = 0.1):
+        """
+        :param estimator: classifier exposing fit, predict_proba,
+            get_params and set_params
+        :param Callable cost_func: called as cost_func(y_true, y_pred)
+        :param bool greater_is_better: when True the threshold with
+            the highest cost_func value is chosen, else the lowest
+        :param cv: anything accepted by sklearn.model_selection.check_cv,
+            for example an iterable of (train indices, validation indices).
+            None selects the default splitter of check_cv.
+        :param float threshold_step: step of the threshold grid
+            threshold_step, 2*threshold_step, ... (below 1)
+        """
+        self.estimator = estimator
+
+        self.is_fitted = False
+
+        self.greater_is_better = greater_is_better
+
+        # resolved into a splitter at fit time, where y is available
+        self.cv = cv
+
+        self.cost_func = cost_func
+
+        self.threshold_step = threshold_step
+
+        # used by predict until fit has computed a tuned value
+        self.optimal_threshold = 0.5
+
+        self._logger = Log("FineTunedClassifyCV")
+
+    @staticmethod
+    def _positive_class_proba(estimator, X) -> np.ndarray:
+        '''
+        Probabilities of the second class (column 1 of predict_proba).
+        A one-dimensional output is returned unchanged.
+        '''
+        proba = np.asarray(estimator.predict_proba(X))
+
+        return proba[:, 1] if proba.ndim == 2 else proba
+
+    def _get_best_threshold(self, y_val: (pd.DataFrame, np.ndarray),
+                            proba_pred: (pd.DataFrame, np.ndarray)):
+        '''
+        Evaluates cost_func on the threshold grid and returns
+        the threshold with the best value.
+
+        :param y_val: true labels of the validation fold
+        :param proba_pred: predicted probabilities of the positive class
+        '''
+        costs = {}
+
+        for t in np.arange(self.threshold_step, 1, self.threshold_step):
+            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))
+
+        if self.greater_is_better:
+            return max(costs, key=costs.get)
+        else:
+            return min(costs, key=costs.get)
+
+    def fit(self, X: (pd.DataFrame, np.ndarray),
+            y: (pd.DataFrame, np.ndarray) = None,
+            **fit_args):
+        """
+        Finds the best threshold on every validation fold with a clone
+        of the estimator, stores the mean of these thresholds as
+        optimal_threshold and fits the estimator on the entire data.
+
+        :param X: features
+        :param y: labels
+        :param fit_args: forwarded to the fit method of the estimator
+        :return: self
+        """
+        from sklearn.model_selection import check_cv
+
+        X = TypeConverter().convert_to_ndarray(X)
+        if y is not None:
+            y = TypeConverter().convert_to_ndarray(y)
+
+        # accepts None, an int, a splitter or an iterable of index pairs
+        cv = check_cv(self.cv, y, classifier=True)
+
+        optimal_thrs_per_fold = []
+
+        for train_inds, val_inds in cv.split(X, y):
+            X_train, X_val = X[train_inds], X[val_inds]
+
+            if y is not None:
+                y_train, y_val = y[train_inds], y[val_inds]
+            else:
+                y_train, y_val = None, None
+
+            # a fresh unfitted copy per fold
+            estimator = clone(self.estimator)
+
+            estimator.fit(X_train, y_train, **fit_args)
+
+            proba_pred = self._positive_class_proba(estimator, X_val)
+
+            optimal_thr = self._get_best_threshold(y_val, proba_pred)
+
+            optimal_thrs_per_fold.append(optimal_thr)
+
+        self.optimal_threshold = np.mean(optimal_thrs_per_fold)
+
+        self.estimator.fit(X, y, **fit_args)
+
+        self.is_fitted = True
+
+        return self
+
+    def predict(self, X: (pd.DataFrame, np.ndarray)) -> np.ndarray:
+        """
+        :param X: features
+        :return: 1 where the probability of the positive class is at
+            least optimal_threshold, else 0. When the object was not
+            fitted, a warning is logged and None is returned.
+        """
+        if not self.is_fitted:
+            self._logger.warn("You should fit first")
+            return None
+
+        proba_pred = self._positive_class_proba(self.estimator, X)
+
+        return (proba_pred >= self.optimal_threshold).astype(int)
+
+    def get_params(self, deep: bool = True):
+        """
+        Parameters of the wrapped estimator together with
+        "cv" and "cost_func" of this object.
+
+        :param bool deep: forwarded to get_params of the estimator
+        """
+        # NOTE(review): "estimator", "greater_is_better" and
+        # "threshold_step" are not part of the result, so sklearn's clone
+        # cannot rebuild this object from it - confirm this is intended.
+        params = self.estimator.get_params(deep=deep)
+
+        params.update({"cv": self.cv, "cost_func": self.cost_func})
+
+        return params
+
+    def set_params(self, **params: dict):
+        """
+        Sets "cv" and "cost_func" on this object and forwards
+        all other parameters to the wrapped estimator.
+
+        :return: self
+        """
+        # params is a fresh dict of keyword arguments, safe to pop from
+        for own_param in ("cv", "cost_func"):
+            if own_param in params:
+                setattr(self, own_param, params.pop(own_param))
+
+        self.estimator.set_params(**params)
+
+        return self
+
+
+if __name__ == "__main__":
+    # smoke test: tune the threshold of a binary classifier
+    # (iris class 1 against the rest) on expanding-window folds
+    import gc
+    from sklearn.datasets import load_iris
+    from sklearn.metrics import accuracy_score
+    from xgboost import XGBRFClassifier
+
+    data = load_iris()
+    X = data["data"]
+    y = (data["target"] == 1).astype(int)
+    del data
+    gc.collect()
+
+    # custom cv object: train on everything before the split index,
+    # validate on the val_len rows that follow it
+    val_len = len(X)//10
+
+    cv = [(list(range(split_ind)),
+           list(range(split_ind, split_ind + val_len)))
+          for split_ind in range(len(X)//2, len(X), val_len)]
+
+    clf = XGBRFClassifier()
+
+    fine_tuned_clf = FineTunedClassifierCV(estimator=clf,
+                                           cv=cv,
+                                           greater_is_better=True,
+                                           cost_func=accuracy_score)
+
+    fine_tuned_clf.fit(X=X, y=y)
+

+ 375 - 0
cdplib/gridsearch/GridSearchPipelineSelector.py

@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:15:17 2020
+
+@author: tanya
+@description:a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is designed in such a way that a trials object is
+ maintained during the tuning process, from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import os
+import datetime
+from collections import ChainMap
+from itertools import product
+from typing import Callable, Optional, Literal, Dict, Union, List
+
+import numpy as np
+import pandas as pd
+from sklearn.base import clone
+from sklearn.pipeline import Pipeline
+
+from cdplib.log import Log
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
+
+
+class GridSearchPipelineSelector(PipelineSelector):
+    """
+    A class for selecting a machine learning
+     pipeline from a deterministic space of parameter distributions
+     over multiple pipelines.
+     The trials are kept in a list of dicts (one dict per evaluated
+     combination of pipeline and parameters) that is maintained
+     during the tuning process and from which one can retrieve
+     the best pipeline so far as well as the entire tuning history
+     if needed.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"
+                 ):
+        """
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+            if None, the trials object is backed up every time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metrics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        try:
+
+            super().__init__(cost_func=cost_func,
+                             greater_is_better=greater_is_better,
+                             trials_path=trials_path,
+                             backup_trials_freq=backup_trials_freq,
+                             cross_val_averaging_func=cross_val_averaging_func,
+                             additional_metrics=additional_metrics,
+                             strategy_name=strategy_name,
+                             stdout_log_level=stdout_log_level)
+
+            self._logger = Log("GridsearchPipelineSelector: ",
+                               stdout_log_level=stdout_log_level)
+
+            # start with an empty history when the parent
+            # did not provide any trials
+            self._trials = self._trials or []
+
+        except Exception as e:
+            err = "Failed initialization. Exit with error: {}".format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    def run_trials(self) -> None:
+        """
+        Evaluates every combination of pipeline and parameter values
+        of the attached space that is not yet among the successful
+        trials, appends the results to the trials and backs them up
+        at the end.
+        """
+        try:
+            assert(self.attached_space),\
+                "Parameter distribution space must be attached"
+
+            done_trial_ids = [{"name": trial["name"],
+                               "params": trial["params"],
+                               "status": trial["status"]}
+                              for trial in self._trials]
+
+            # list (generator) of (flattened) dictionaries
+            # with all different combinations of
+            # parameters for different pipelines
+            # from the space definition.
+            space_unfolded = ({"name": param_dist["name"],
+                               "pipeline": param_dist["pipeline"],
+                               "params": param_set}
+                              for param_dist in self._space
+                              for param_set in
+                              (dict(ChainMap(*tup)) for tup in
+                               product(*[[{k: v} for v in
+                                          param_dist["params"][k]]
+                                         for k in param_dist["params"]])))
+
+            for space_element in space_unfolded:
+
+                # uniquely identifies the current space element
+                trial_id = {"name": space_element["name"],
+                            "params": space_element["params"],
+                            "status": 'ok'}
+
+                # verify if the current pipeline/parameters
+                # were already tested before
+                if trial_id in done_trial_ids:
+                    continue
+
+                result = self._objective(space_element)
+
+                # set_params returns the object it was called on, so
+                # without a clone all trials of one pipeline type would
+                # store the same object holding the last parameter set
+                pipeline = clone(space_element["pipeline"]).set_params(
+                        **space_element["params"])
+
+                trial = {"name": space_element["name"],
+                         "params": space_element["params"],
+                         "pipeline": pipeline}
+
+                trial.update(result)
+
+                self._trials.append(trial)
+
+            self.finished_tuning = True
+
+            self.total_tuning_time = datetime.datetime.today()\
+                - self.start_tuning_time
+
+            self._backup_trials()
+
+        except Exception as e:
+            err = "Failed to run trials. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def number_of_trials(self) -> Union[int, None]:
+        """
+        Number of trials already run in the current trials object
+        """
+        try:
+            return len(self._trials)
+
+        except Exception as e:
+            err = ("Failed to retrieve the number of trials. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial(self) -> Union[dict, None]:
+        """
+        Trial with the highest value under the key "score"
+        """
+        # NOTE(review): the maximum is taken regardless of
+        # greater_is_better; confirm that the parent's _objective
+        # returns scores for which higher means better.
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return max(self._trials, key=lambda x: x["score"])
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score(self) -> Union[float, None]:
+        '''
+        Value stored under "score" in the best trial
+        '''
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return self.best_trial["score"]
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score_variance(self) -> Union[float, None]:
+        '''
+        Value stored under "score_variance" in the best trial
+        '''
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return self.best_trial["score_variance"]
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
+        '''
+        Pipeline stored under "pipeline" in the best trial
+        '''
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return self.best_trial["pipeline"]
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
+        """
+        N best pipelines with corresponding
+        best hyperparameters, ordered by decreasing score
+        """
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return [trial["pipeline"] for trial in
+                    sorted(self._trials, key=lambda x: x["score"],
+                           reverse=True)[:n]]
+
+        except Exception as e:
+            err = ("Failed to retrieve n best trials. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
+            -> Union[Dict[str, List[Pipeline]], None]:
+        """
+        If the hyperparameter search is done over multiple
+        pipelines, then returns for every pipeline name
+        the n pipelines with the highest scores
+        """
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return pd.DataFrame(self._trials)\
+                     .sort_values(by=["name", "score"],
+                                  ascending=False)\
+                     .groupby("name")\
+                     .head(n)\
+                     .groupby("name")["pipeline"]\
+                     .apply(lambda x: list(x))\
+                     .to_dict()
+
+        except Exception as e:
+            err = ("Failed to retrieve n best trials of each type."
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def trials_to_excel(self, path: str) -> None:
+        """
+        Trials object in the shape of table written to excel,
+        should contain the run number, pipeline (as str),
+        hyperparameters (as str), self.best_result (see self._objective method)
+        as well as additional information configured
+        through self.save_result method.
+        """
+        try:
+            pd.DataFrame(self._trials).to_excel(path)
+
+        except Exception as e:
+            err = ("Failed to write trials to excel. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+
+if __name__ == "__main__":
+
+    # elementary example: grid search over the sample space on the
+    # breast cancer data set with two hand-made folds; the summary is
+    # saved through MongodbHandler and the temporary files are removed
+    # at the end
+
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.metrics import accuracy_score, precision_score
+    from cdplib.gridsearch.space_sample import space
+    from cdplib.log import Log
+    from cdplib.db_handlers import MongodbHandler
+    import pickle
+    import pandas as pd
+    import os
+
+    trials_path = "gridsearch_trials_TEST.pkl"
+    additional_metrics = {"precision": precision_score}
+    strategy_name = "strategy_1"
+    data_path = "data_TEST.h5"
+    cv_path = "cv_TEST.pkl"
+    collection_name = 'TEST_' + strategy_name
+
+    logger = Log("GridSearchPipelineSelector__TEST:")
+
+    logger.info("Start test")
+
+    data_loader = load_breast_cancer()
+
+    X = data_loader["data"]
+    y = data_loader["target"]
+
+    pd.DataFrame(X).to_hdf(data_path, key="X_train")
+    pd.Series(y).to_hdf(data_path, key="y_train")
+
+    # two folds: train on the first third / first two thirds,
+    # validate on the remaining rows
+    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
+          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]
+
+    # the context manager closes the file handle after the dump
+    with open(cv_path, "wb") as cv_file:
+        pickle.dump(cv, cv_file)
+
+    gs = GridSearchPipelineSelector(cost_func=accuracy_score,
+                                    greater_is_better=True,
+                                    trials_path=trials_path,
+                                    additional_metrics=additional_metrics,
+                                    strategy_name=strategy_name,
+                                    stdout_log_level="WARNING")
+
+    gs.attach_space(space=space)
+
+    gs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
+                             cv_pickle_path=cv_path)
+
+    save_method = MongodbHandler().insert_data_into_collection
+    save_kwargs = {'collection_name': collection_name}
+
+    gs.configer_summary_saving(save_method=save_method,
+                               kwargs=save_kwargs)
+
+    gs.run_trials()
+
+    logger.info("Best trial: {}".format(gs.best_trial))
+    logger.info("Total tuning time: {}".format(gs.total_tuning_time))
+
+    for file in [trials_path, data_path, cv_path]:
+        os.remove(file)
+
+    logger.info("End test")

+ 33 - 0
cdplib/gridsearch/space_sample.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+
+# Sample search space for the grid search: one entry per pipeline type,
+# each with a name, an unfitted sklearn Pipeline and, per pipeline
+# parameter (written as <step name>__<parameter>), the list of values
+# to try out.
+space = [
+        # scaling -> percentile based feature selection -> random forest
+        dict(name="std_scaler_kbest_rf",
+             pipeline=Pipeline(steps=[
+                     ("std_scaler", StandardScaler()),
+                     ("kbest", SelectPercentile()),
+                     ("rf", RandomForestClassifier())]),
+             params={"kbest__percentile": [2, 3],
+                     "rf__n_estimators": [10, 20]}),
+
+        # scaling -> PCA -> logistic regression
+        dict(name="std_scaler_pca_lr",
+             pipeline=Pipeline(steps=[
+                     ("std_scaler", StandardScaler()),
+                     ("pca", PCA()),
+                     ("lr", LogisticRegression())]),
+             params={"lr__C": [0.5, 1],
+                     "pca__n_components": [2, 3]})
+        ]

+ 0 - 798
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -1,798 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Nov  9 13:27:44 2018
-
-@author: tanja
-@description: Implementation of machine learning
-                pipeline selection and tuning with hyperopt library
-"""
-
-import os
-import sys
-import gc
-import logging
-import pickle
-import time
-import datetime
-
-import pandas as pd
-import numpy as np
-
-from sklearn.pipeline import Pipeline
-
-from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
-    space_eval, pyll
-
-from sklearn.model_selection import cross_validate
-
-
-class HyperoptPipelineSelection:
-    '''
-    Use this class to perform a search
-    for a machine learning pipeline in a given parameter space.
-    The parameter space can include multiple types of Pipelines
-    (SVM, XGBOOST, random forest, etc),
-    as well as parameter distributions for each pipeline parameter.
-    See example in main for the expected space structure.
-
-    The search can be performed either randomly
-    or with a tree-based algorithm. (Other methods are currently
-    developped by hyperopt creators).
-
-    Attribute trials is responsible for book-keeping parameter
-    combinations that have already been tried out. This attribute
-    is saved to a binary file every n minutes as well as every time
-    a better pipeline was found.
-    '''
-    def __init__(self,
-                 cost_func,
-                 greater_is_better: bool,
-                 trials_path: str,
-                 backup_trials_freq: int = 1,
-                 log_path: str = None,
-                 averaging_func: callable = None):
-        '''
-        :param callable cost_func: function to minimize or maximize
-
-        :param bool greater_is_better: when True
-            cost_func is maximized, else minimized.
-
-        :param str trials_path: path at which the trials object is saved
-            in binary format. From the trials object we can
-            select information about the obtained scores, score variations,
-            and pipelines, and parameters tried out so far. If a trials object
-            already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
-
-        :param backup_trials_freq: frequecy in interations (trials)
-            of saving the trials object at the trials_path.
-
-        :param str log_path: Optional, when not provided logs to stdout.
-
-        :param callable averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            Classic situation is to take the mean,
-            another example is, for example mean() - c*var().
-        '''
-
-        assert(callable(cost_func)),\
-            "Parameter 'cost_func' must be a callable"
-
-        assert(isinstance(greater_is_better, bool)),\
-            "Parameter 'greater_is_better' must be bool type"
-
-        assert(isinstance(trials_path, str)),\
-            "Parameter 'trials_path' must be of string type"
-
-        if averaging_func is not None:
-            assert(callable(averaging_func)),\
-                "Parameter 'averaging_func' must be a callable"
-
-        self._assert_valid_directory(path=trials_path)
-
-        self._configer_logger(log_path)
-
-        self._cost_func = cost_func
-        # is 1 when cost_func is minimized, -1 when cost func is maximized
-        self._score_factor = (not greater_is_better) - greater_is_better
-        self._trials_path = trials_path
-        # is initialized with empty trials object
-        self._trials = Trials()
-        self._backup_trials_freq = backup_trials_freq
-        self._averaging_func = averaging_func or np.mean
-        # keeping track of the current search iteration
-        self._run_number = 0
-        # space and data need to be attached to perform search.
-        self._space_attached = False
-        self._data_attached = False
-
-        # if a trials object already exists at the given path,
-        # it is loaded and the search is continued. Else,
-        # the search is started from the beginning.
-        if os.path.isfile(trials_path):
-            try:
-                with open(trials_path, "rb") as f:
-                    self._trials = pickle.load(f)
-
-                self._logger.info(("Loaded an existing trials object"
-                                   "Consisting of {} trials")
-                                  .format(len(self._trials.trials)))
-
-            except Exception as e:
-                self._logger.error(("Trials object could not be loaded. "
-                                    "Training starts from the beginning. "
-                                    "Exit with error {}").format(e))
-
-        else:
-            self._logger.info(("No existing trials object was found"
-                               "Initialized an empty trials object."))
-
-        self._best_score = self.best_trial_score
-
-    def _configer_logger(self, log_path: str = None):
-        '''
-        Can be replaced with the existing script later.
-        When log_path is not provided, logs to stdout.
-        '''
-
-        self._logger = logging.getLogger(__name__)
-
-        if (self._logger.hasHandlers()):
-            self._logger.handlers.clear()
-
-        if log_path is not None:
-            assert(isinstance(log_path, str)),\
-                "Parameter 'log_path' must be of string type"
-            self._assert_valid_directory(log_path)
-
-            handler = logging.FileHandler(log_path)
-        else:
-            handler = logging.StreamHandler(sys.stdout)
-
-        formatter = logging.Formatter(
-                '\n %(asctime)s %(levelname)s %(message)s')
-
-        handler.setFormatter(formatter)
-        self._logger.addHandler(handler)
-        self._logger.setLevel("INFO")
-
-    def _backup_trials(self):
-        '''
-        Pickles (Saves) the trials object.
-        Used in a scheduler.
-        '''
-        with open(self._trials_path, "wb") as f:
-            pickle.dump(self._trials, f)
-
-    def _assert_valid_directory(self, path: str):
-        '''
-        If the directory of a path does not exist yet,
-        creates it.
-        '''
-        assert(isinstance(path, str)),\
-            "Parameter 'path' must of str type"
-
-        dirname = os.path.dirname("path")
-
-        if len(dirname) > 0:
-            os.mkdir(dirname, exists_ok=True)
-
-    def attach_space(self, space: pyll.base.Apply = None,
-                     module_path: str = None,
-                     name: str = None):
-        '''
-        Attaches the hyperopt search space, given either directly
-        or as the name of a variable defined in a python file.
-
-        :param pyll.base.Apply space: hyperopt space where
-            the search is performed. Optional when a space
-            is loaded from a python module.
-
-        :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
-
-        :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
-
-        :raises Exception: when the module cannot be loaded or
-            does not define a variable with the given name
-        '''
-        assert((space is not None) or
-               ((module_path is not None) and (name is not None))),\
-            "Either space or (module_path, name) must be provided"
-
-        if space is None:
-            # check the parameter values, not the parameter names
-            for p_name, p_value in [("module_path", module_path),
-                                    ("name", name)]:
-                assert(isinstance(p_value, str)),\
-                    "Parameter '{}' must be of str type".format(p_name)
-
-            assert(os.path.isfile(module_path)),\
-                "Parameter 'module_path' must be a valid file"
-
-            module, extension = os.path.splitext(os.path.basename(module_path))
-            assert(extension == ".py"),\
-                "Parameter 'space' must be read from a python file"
-
-            # load the module directly from its file location,
-            # this does not require modifying sys.path
-            import importlib.util
-
-            try:
-                spec = importlib.util.spec_from_file_location(
-                        module, module_path)
-                loaded_module = importlib.util.module_from_spec(spec)
-                spec.loader.exec_module(loaded_module)
-
-                space = getattr(loaded_module, name)
-
-            except (ImportError, AttributeError):
-                err = "Invalid space location or name"
-                self._logger.error(err)
-                raise Exception(err)
-
-        assert(isinstance(space, pyll.base.Apply)),\
-            "Parameter 'space' must be of hyperopt space type"
-
-        self._space = space
-        self._logger.info("Attached parameter distribution space")
-        self._space_attached = True
-
-    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
-            -> np.ndarray:
-        '''
-        Returns the input as a numpy array: numpy arrays are
-        returned unchanged, pandas DataFrames and Series are
-        replaced by their values.
-
-        :raises ValueError: for any other type of input
-        '''
-        if isinstance(x, (pd.core.frame.DataFrame, pd.core.series.Series)):
-            return x.values
-
-        if not isinstance(x, np.ndarray):
-            e = 'The argument must be a numpy array or a pandas DataFrame'
-            self._logger.critical(e)
-            raise ValueError(e)
-
-        return x
-
-    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
-                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    X_val: (pd.DataFrame, np.ndarray) = None,
-                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    cv: (list, int) = None):
-        '''
-        Stores the data (self._X, self._y) and the definition of
-        the validation splits (self._cv) used to score the pipelines.
-
-        :param array X_train: data on which
-            machine learning pipelines are trained
-
-        :param array y_train: optional, vector with targets
-            (not all algorithms require targets)
-
-        :param array X_val: optional, validation data.
-            When not provided, a cross-validated value
-            of the cost_func is calculated.
-
-        :param array y_val: optional, validation targets.
-            Required when both y_train and X_val are given.
-
-        :param list cv: list of tuples containing
-            train and validation indices or an integer representing
-            the number of folds for a random split of data
-            during cross-validation
-            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
-
-        :raises ValueError: when y_train and X_val are given without y_val
-        '''
-        X_train = self._convert_to_array(X_train)
-        if y_train is not None:
-            y_train = self._convert_to_array(y_train)
-
-        if X_val is None:
-            # no validation set: the score is cross-validated
-            if cv is None:
-                self._logger.warning(("Neither validation set nor cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on 5 randomly "
-                                      "splitted folds."))
-
-            self._X, self._y, self._cv = X_train, y_train, cv
-
-        else:
-            if cv is not None:
-                self._logger.warning(("Both validation set and cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on the validation set!"))
-
-            X_val = self._convert_to_array(X_val)
-
-            n_train, n_val = len(X_train), len(X_val)
-
-            # The cost is evaluated by a cross-validation function taking
-            # one array and the indices of the fold splits, so train and
-            # validation data are stacked and described by a single split.
-            self._cv = [(list(range(n_train)),
-                         list(range(n_train, n_train + n_val)))]
-            self._X = np.concatenate([X_train, X_val])
-
-            if y_train is None:
-                self._y = None
-            elif y_val is None:
-                err = "Argument y_val must be provided"
-                self._logger.critical(err)
-                raise ValueError(err)
-            else:
-                y_val = self._convert_to_array(y_val)
-                self._y = np.concatenate([y_train, y_val])
-
-        self._logger.info("Attached data")
-        self._data_attached = True
-
-    def _evaluate(self, pipeline: Pipeline) -> dict:
-        '''
-        Called in _objective to calculate the cost
-        of a pipeline on the attached data.
-
-        Can be overridden when the cost needs to be
-        calculated differently, for example with a tensorflow model.
-
-        :param Pipeline pipeline: machine learning pipeline
-            that is evaluated with cross-validation
-            (5 folds when no cv object was attached)
-
-        :output: dictionary with the keys
-            value (cross-validation scores aggregated
-            with the averaging function) and
-            variance (variance of the cross-validation scores).
-        '''
-        cv_results = cross_validate(estimator=pipeline,
-                                    X=self._X,
-                                    y=self._y,
-                                    cv=self._cv or 5,
-                                    scoring=make_scorer(self._cost_func),
-                                    error_score=np.nan)
-
-        test_scores = cv_results['test_score']
-
-        return {'value': self._averaging_func(test_scores),
-                'variance': np.var(test_scores)}
-
-    def _objective(self, space_element: dict) -> dict:
-        '''
-        This method is called in search_for_best_pipeline
-        inside the hyperopt fmin method.
-
-        Uses _evaluate method.
-
-        It must take as input a space element
-        and produce an output in the form of dictionary
-        with 2 obligatory values loss and status
-        (STATUS_OK or STATUS_FAIL). Other
-        values in the output are optional and can be
-        accessed later through the trials object.
-
-        Side effects: increments the run counter, sets the
-        parameters on the pipeline of the space element, updates
-        the best score and backs up the trials object
-        (every _backup_trials_freq runs and on every improvement).
-
-        :Warning: fmin minimizes the loss,
-        when _evaluate returns a value to be maximized,
-        it should be multiplied by -1 to obtain loss.
-
-        :param dict space_element: must contain keys
-            name (with the name of the pipeline),
-            pipeline (Pipeline object),
-            params (dict of pipeline params)
-
-        :output: dictionary with keys
-            loss (minimized value),
-            status with values STATUS_OK or STATUS_FAIL
-            understood by hyperopt,
-            score (equal to loss or -loss),
-            score_variance,
-            timestamp (end of execution),
-            train_time: execution time.
-            When the evaluation fails, status is STATUS_FAIL
-            and all numeric values are nan.
-
-        :raises Exception: when no data is attached
-            (raised outside of the try block, so it is not
-            converted into a failed trial)
-        '''
-        assert(isinstance(space_element, dict) and
-               set(['name', 'pipeline', 'params']) <= space_element.keys())
-
-        assert(isinstance(space_element['name'], str) and
-               isinstance(space_element['pipeline'], Pipeline) and
-               isinstance(space_element['params'], dict))
-
-        start_time = time.time()
-
-        if not self._data_attached:
-            raise Exception(("Data must be attached in order "
-                             "in order to effectuate the best"
-                             "pipeline search"))
-
-        self._run_number += 1
-
-        pipeline = space_element['pipeline']
-        params = space_element['params']
-        pipeline.set_params(**params)
-
-        # the logged "current score" is the best score found so far
-        self._logger.info(("Run number {0}: "
-                           "Current score is {1}: "
-                           "Training pipeline {2} "
-                           "with parameters: {3}. ").format(
-                             self._run_number,
-                             self._best_score,
-                             space_element['name'],
-                             params))
-
-        try:
-            score_stats = self._evaluate(pipeline)
-            assert(not np.isnan(score_stats["value"])),\
-                "Returned null score"
-
-            # periodic backup, independent of the score
-            if self._run_number % self._backup_trials_freq == 0:
-                self._backup_trials()
-
-            # nan != nan: the first condition holds while there is
-            # no best score yet. Scores are compared after multiplication
-            # with _score_factor, i.e. as losses
-            # (presumably -1 when greater is better - defined outside
-            # of this block, TODO confirm).
-            if (self._best_score != self._best_score) or\
-                self._score_factor*score_stats["value"] <\
-                    self._score_factor*self._best_score:
-
-                self._logger.info("Score got better, new best score is: {}"
-                                  .format(score_stats["value"]))
-
-                self._best_score = score_stats['value']
-
-                self._backup_trials()
-
-            end_time = time.time()
-
-            return {'loss': self._score_factor * score_stats["value"],
-                    'status': STATUS_OK,
-                    'score': score_stats["value"],
-                    'score_variance': score_stats["variance"],
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': end_time - start_time}
-
-        except Exception as e:
-
-            # any error (including the nan assertion above)
-            # is reported to hyperopt as a failed trial
-            self._logger.warning("Trial failed with error {}".format(e))
-
-            return {'loss': np.nan,
-                    'status': STATUS_FAIL,
-                    'score': np.nan,
-                    'score_variance': np.nan,
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': np.nan}
-
-    def search_for_best_pipeline(self,
-                                 niter: int,
-                                 algo: callable = tpe.suggest):
-        '''
-        Method performing the search of the best pipeline in the
-        attached space. Calls fmin function from the hyperopt library
-        to minimize the output of _objective and backs up
-        the trials object at the end.
-
-        :param int niter: number of search iterations, performed
-            additionally to the trials that are already documented
-        :param callable algo: now can only take values tpe.suggest
-            for a tree-based search or rand.suggest for random search
-
-        :raises ValueError: when the search fails for any reason
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        assert(isinstance(niter, int)),\
-            "Parameter 'niter' must be of int type"
-
-        # right now only two algorithms are provided by hyperopt
-        assert(algo in [tpe.suggest, rand.suggest]),\
-            ("Parameter 'algo' can be now only tpe or random. "
-             "If other algorithms have been developped by "
-             "by hyperopt, plased add them to the list.")
-
-        try:
-            self._logger.info(("Starting {0} iterations of search "
-                               "additional to {1} previous"
-                               .format(niter, len(self._trials.trials))))
-
-            # search in the attached space (self._space),
-            # not in a module level variable
-            best = fmin(fn=self._objective,
-                        space=self._space,
-                        algo=algo,
-                        trials=self._trials,
-                        max_evals=len(self._trials.trials) + niter)
-
-            self._logger.info(
-                    "Best score is {0} with variance {1}"
-                    .format(
-                     self._trials.best_trial["result"]["score"],
-                     self._trials.best_trial["result"]["score_variance"]))
-
-            self._logger.info(("Finished {0} iterations of search.\n"
-                               "Best parameters are:\n {1} ")
-                              .format(niter,
-                                      space_eval(self._space, best)))
-
-            self._backup_trials()
-
-        except Exception as e:
-            # keep the original error as the cause of the ValueError
-            raise ValueError(("Failed to select best "
-                             "pipeline! Exit with error: {}").format(e)) from e
-
-    @property
-    def best_trial_score(self) -> float:
-        '''
-        Score of the best trial,
-        nan when no trial is documented yet.
-        '''
-        if len(self._trials.trials) == 0:
-            return np.nan
-
-        return self._trials.best_trial["result"]["score"]
-
-    @property
-    def best_trial_score_variance(self) -> float:
-        '''
-        Score variance of the best trial,
-        nan when no trial is documented yet.
-        '''
-        if len(self._trials.trials) == 0:
-            return np.nan
-
-        return self._trials.best_trial["result"]["score_variance"]
-
-    @property
-    def best_trial_pipeline(self) -> Pipeline:
-        '''
-        Pipeline of the best trial, reconstructed from the attached
-        space and the parameter values stored in the trial.
-
-        :raises Exception: when the trials object is empty
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) > 0:
-
-            # evaluate the attached space (self._space),
-            # not a module level variable
-            return space_eval(
-                    self._space,
-                    {k: v[0] for k, v in
-                     self._trials.best_trial['misc']['vals'].items()
-                     if len(v) > 0})["pipeline"]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def _ith_trial_loss(self, i: int) -> float:
-        '''
-        Loss of the trial number i,
-        nan when there is no trial with this number.
-        '''
-        # valid trial numbers are 0, ..., len - 1
-        if len(self._trials.trials) > i:
-            return self._trials.trials[i]['result']['loss']
-        else:
-            return np.nan
-
-    def _ith_trial_element(self, i: int, name: str) -> object:
-        '''
-        Entry with the key name of the space element
-        that was evaluated in the trial number i.
-        The space element is reconstructed from the attached space
-        and the parameter values stored in the trial.
-
-        :return: the requested entry, None when there is
-            no trial with this number
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        # valid trial numbers are 0, ..., len - 1
-        if len(self._trials.trials) > i:
-            return space_eval(self._space,
-                              {k: v[0] for k, v in
-                               self._trials.trials[i]['misc']['vals']
-                               .items() if len(v) > 0})[name]
-
-    def _ith_trial_pipeline(self, i: int) -> Pipeline:
-        '''
-        Pipeline evaluated in the trial number i.
-        '''
-        pipeline = self._ith_trial_element(i, 'pipeline')
-        return pipeline
-
-    def _ith_trial_name(self, i: int) -> str:
-        '''
-        Name of the pipeline evaluated in the trial number i.
-        '''
-        pipeline_name = self._ith_trial_element(i, 'name')
-        return pipeline_name
-
-    def _ith_trial_params(self, i: int) -> dict:
-        '''
-        Pipeline parameters evaluated in the trial number i.
-        '''
-        pipeline_params = self._ith_trial_element(i, 'params')
-        return pipeline_params
-
-    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
-        '''
-        Timestamp stored in the result of the trial number i,
-        None when there is no trial with this number.
-        '''
-        # valid trial numbers are 0, ..., len - 1
-        if len(self._trials.trials) > i:
-            return self._trials.trials[i]["result"]["timestamp"]
-
-    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
-        '''
-        Returns the list of pipelines of the trials with the
-        n smallest distinct losses. Failed trials (nan loss) are ignored.
-
-        :param int n: maximal number of returned pipelines
-        :param list losses: optional, losses to rank. The position
-            of a loss in the list is used as the trial number.
-            By default the losses of all documented trials are used.
-
-        :raises Exception: when the trials object is empty
-        '''
-        if len(self._trials.trials) > 0:
-            if losses is None:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))]
-
-            # nan != nan: failed trials are dropped, because
-            # sorting a list that contains nan is not reliable
-            distinct_losses = sorted(set(
-                    loss for loss in losses if loss == loss))
-
-            best_n_indices = [losses.index(loss)
-                              for loss in distinct_losses[:n]]
-
-            return [self._ith_trial_pipeline(i) for i in best_n_indices]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
-        '''
-        Returns a dictionary where keys are pipeline names,
-        and values are lists of the best pipelines with this name
-        (n smallest distinct losses, failed trials are ignored).
-
-        :param int n: maximal number of pipelines per name
-
-        :raises Exception: when the trials object is empty
-        '''
-        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
-
-        if len(self._trials.trials) > 0:
-
-            # For every pipeline name: loss -> number of the first trial
-            # with this loss. Trial numbers (and not positions in a
-            # filtered list of losses) are stored, so that the pipelines
-            # are looked up in the complete list of trials.
-            trial_per_loss_and_name = {}
-
-            for i in range(len(self._trials.trials)):
-                trial_per_loss = trial_per_loss_and_name.setdefault(
-                        self._ith_trial_name(i), {})
-
-                loss = self._ith_trial_loss(i)
-
-                # nan != nan: failed trials are skipped
-                if loss == loss:
-                    trial_per_loss.setdefault(loss, i)
-
-            return {nm: [self._ith_trial_pipeline(trial_per_loss[loss])
-                         for loss in sorted(trial_per_loss)[:n]]
-                    for nm, trial_per_loss
-                    in trial_per_loss_and_name.items()}
-
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def write_trials_documentation(self, path: str = None):
-        '''
-        Saves an excel file with pipeline names, scores,
-        parameters, and timestamps of all documented trials.
-        An empty table is written when there are no trials.
-
-        :param str path: optional, path of the excel file,
-            by default hyperopt_trials_documentation.xlsx
-            in the working directory
-        '''
-        # the path is resolved before looking at the trials,
-        # so that it is also defined when the trials object is empty
-        path = path or "hyperopt_trials_documentation.xlsx"
-
-        assert(isinstance(path, str)),\
-            "Parameter 'path' must be of string type"
-
-        self._assert_valid_directory(path)
-
-        trial_numbers = range(len(self._trials.trials))
-
-        names = [self._ith_trial_name(i) for i in trial_numbers]
-        # the loss is converted back to the score
-        scores = [self._score_factor*self._ith_trial_loss(i)
-                  for i in trial_numbers]
-        params = [self._ith_trial_params(i) for i in trial_numbers]
-        timestamps = [self._ith_trial_timestamp(i) for i in trial_numbers]
-
-        pd.DataFrame({"name": names,
-                      "score": scores,
-                      "params": params,
-                      "timestamp": timestamps})\
-          .to_excel(path)
-
-
-# Demo / manual test: tunes two pipelines on a binary version
-# of the iris data set and prints the results of the search.
-if __name__ == '__main__':
-
-    from sklearn.metrics import roc_auc_score, make_scorer
-    from xgboost import XGBClassifier
-    from sklearn.svm import SVC
-    from sklearn.feature_selection import SelectKBest
-    from sklearn.decomposition import PCA
-    from sklearn.datasets import load_iris
-    from pprint import pprint
-
-    data = load_iris()
-    X = pd.DataFrame(data.data)
-    y = pd.Series(data.target)
-    # produce a binary variable (class 2 against the rest)
-    y = (y == 2).astype(int)
-    del data
-    gc.collect()
-
-    # SPACE DEFINITION ########################################
-    # (can be moved to a separate python script)
-
-    """
-    A search space must be a list of dictionaries.
-    Each dictionry must have keys:
-        name (pipeline name or type),
-        pipeline (instance of sklearn.pipeline.Pipeline),
-        params (dictionary of distributions for the parameters of
-                the pipeline that we want to tune)
-
-    Here we have a space that consists of two dictionaries:
-    KBEST_XGBOOST and PCA_SVC
-    """
-    space = []
-
-    pipeline_dist_1 = {}
-    pipeline_dist_1["name"] = "KBEST_XGBOOST"
-
-    """
-    A pipeline consists of steps (tuples).
-    Each step has a name and an algorithm.
-    This pipeline, as a first step performs
-    feature selection with SelectKBest and
-    as a second step evaluates a machine learning algo (xgboost).
-
-    Like all sklearn algorithms, a Pipeline has methods
-    fit, predict, set_params, get_params
-    """
-    pipeline_dist_1["pipeline"] = Pipeline([
-                                     ('kbest', SelectKBest()),
-                                     ('xgb', XGBClassifier())
-                                     ])
-    """
-    Pipeline parameter dictionaries must be of the form:
-    {'kbest__k': 3, xgb__n_estimators: 20},
-    each parameter name consists of the step name, __, and parameter name.
-
-    Here, instead of values, the parameter names are followed
-    by hyperopt distributions.
-    Each hyperopt distribution also must have a name,
-    due to hyperopt functionality.
-
-    Here, we set the hyperopt distribution name to the step name,
-    but it does not have to be so. Hyperopt distribution names
-    must be different for different elements of the space.
-    """
-
-    pipeline_dist_1["params"] = {
-            'kbest__k': hp.choice('kbest__k', range(1, 5)),
-
-            'xgb__n_estimators':
-            50 + hp.randint('xgb__n_estimators', 50),
-
-            "xgb__learning_rate":
-            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
-            }
-
-    space.append(pipeline_dist_1)
-
-    # second element of the space: PCA followed by a support vector machine
-    pipeline_dist_2 = {}
-    pipeline_dist_2["name"] = "PCA_SVC"
-
-    pipeline_dist_2["pipeline"] = Pipeline([
-                                     ('pca', PCA()),
-                                     ('svc', SVC(gamma="scale"))
-                                     ])
-
-    pipeline_dist_2["params"] = {
-            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
-
-            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
-            }
-
-    space.append(pipeline_dist_2)
-
-    # the list of space elements is turned into a hyperopt choice
-    space = hp.choice('pipelines', space)
-
-    # TESTING ##########################################################
-
-    trials_path = 'TEST_hyperopt_trials.pkl'
-
-    doc_path = 'TEST_hyperopt_doc.xlsx'
-
-    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
-                                       greater_is_better=True,
-                                       trials_path=trials_path)
-
-    # no validation set and no cv object are passed to attach_data
-    hp_obj.attach_data(X_train=X, y_train=y)
-
-    hp_obj.attach_space(space=space)
-
-    hp_obj.search_for_best_pipeline(niter=10)
-
-    print('\n', '='*20, 'TESTING', '='*20)
-
-    print('\n', 'Best score:', hp_obj.best_trial_score)
-
-    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
-
-    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
-
-    print('\n', 'Best 3 pipelines: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
-
-    print('\n', 'Best pipeline per type: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
-
-    hp_obj.write_trials_documentation(path=doc_path)
-
-    # the files written by the demo are kept, uncomment to remove them
-    # os.remove(doc_path)
-    # os.remove(trials_path)

+ 496 - 0
cdplib/hyperopt/HyperoptPipelineSelector.py

@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct  6 15:04:25 2020
+
+@author: tanya
+@description: a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is done in such a way that a Trials object is
+ maintained during the tuning process, from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import datetime
+import os
+import pickle
+from copy import deepcopy
+from typing import Callable, Optional, Literal, Dict, Union, List
+
+import numpy as np
+import pandas as pd
+from hyperopt import fmin, tpe, rand, Trials, space_eval
+from sklearn.pipeline import Pipeline
+
+from cdplib.log import Log
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector,\
+     SpaceElementType
+
+
+class HyperoptPipelineSelector(PipelineSelector):
+    """
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    being developed by the hyperopt creators).
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every backup_trials_freq trials
+    as well as every time a better pipeline was found.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+            If None, the trials object is backed up every time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metrics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        # The logger is created before the parent constructor is called,
+        # so that the except branch below can always report the failure,
+        # also when the parent fails before any logger was set.
+        logger = Log("HyperoptPipelineSelector: ",
+                     stdout_log_level=stdout_log_level)
+
+        try:
+
+            super().__init__(cost_func=cost_func,
+                             greater_is_better=greater_is_better,
+                             trials_path=trials_path,
+                             backup_trials_freq=backup_trials_freq,
+                             cross_val_averaging_func=cross_val_averaging_func,
+                             additional_metrics=additional_metrics,
+                             strategy_name=strategy_name,
+                             stdout_log_level=stdout_log_level)
+
+            # assigned after the parent constructor has run,
+            # as in the original statement order
+            self._logger = logger
+
+            # start with an empty Trials object
+            # when self._trials is empty or None after the parent constructor
+            self._trials = self._trials or Trials()
+
+        except Exception as e:
+            err = "Failed to intialize. Exit with error: {}".format(e)
+            logger.log_and_raise_error(err)
+
+    def run_trials(self,
+                   niter: int,
+                   algo: Callable = tpe.suggest)\
+            -> None:
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective. After the search the tuning is marked as finished,
+        the total tuning time is stored and the trials are backed up.
+
+        :param int niter: number of search iterations, performed
+            additionally to the trials that are already documented
+        :param algo: search algorithm supported by the hyperopt library.
+            For now these are tpe.suggest for a tree-based bayesian search
+            or rand.suggest for randomized search.
+            (Annotated as Callable: typing.Literal only accepts
+            literal values, not functions.)
+        '''
+        try:
+            self._trials = self._trials or Trials()
+
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            # max_evals counts all trials, including the previous ones
+            best_trial = fmin(fn=self._objective,
+                              space=self._space,
+                              algo=algo,
+                              trials=self._trials,
+                              max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best_trial)))
+
+            self.finished_tuning = True
+
+            self.total_tuning_time = datetime.datetime.today()\
+                - self.start_tuning_time
+
+            self._backup_trials()
+
+        except Exception as e:
+            err = ("Failed to select best "
+                   "pipeline! Exit with error: {}").format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def number_of_trials(self) -> Union[int, None]:
+        """
+        :return: length of the list of trials
+            stored in the Trials object
+        """
+        try:
+            n_trials = len(self._trials.trials)
+
+        except Exception as e:
+            err = ("Failed to retrieve the number of trials. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+        else:
+            return n_trials
+
+    def _get_space_element_from_trial(self, trial: Dict)\
+            -> Optional[SpaceElementType]:
+        """
+        Hyperopt trials object does not contain the space
+             elements that result in the corresponding trials.
+             One has to use the function space_eval from
+             hyperopt to get the space element.
+
+        After retrieving the space element,
+            parameters of the pipeline are set on a copy of the pipeline.
+
+        :param Dict trial: a trial of the Trials object;
+            its entry ['misc']['vals'] is evaluated in the attached space
+
+        :return: space element with the keys read here:
+            pipeline (with parameters set) and params
+        """
+        # Dict[SpaceElementType] is not a valid annotation (Dict takes
+        # a key and a value type), the space element type is used directly.
+        try:
+            trial = deepcopy(trial)
+
+            assert(self.attached_space),\
+                "Hyperparameter space not attached."
+
+            # parameters that were not sampled in this trial
+            # are stored as empty lists and are left out
+            space_element = space_eval(self._space,
+                                       {k: v[0] for k, v in
+                                        trial['misc']['vals'].items()
+                                        if len(v) > 0})
+
+            pipeline = deepcopy(space_element["pipeline"])
+            params = deepcopy(space_element["params"])
+            pipeline.set_params(**params)
+
+            space_element["pipeline"] = pipeline
+
+            return space_element
+
+        except Exception as e:
+            err = ("Failed to retrieve a space element from a trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_space_element_from_index(self, i: int)\
+            -> Optional[SpaceElementType]:
+        """
+        Gets the space element of shape
+        {"name": NAME, "params": PARAMS, "pipeline": PIPELINE}
+        from the trial number i.
+
+        :param int i: number of the trial, must be smaller
+            than the number of documented trials
+        """
+        # Dict[SpaceElementType] is not a valid annotation (Dict takes
+        # a key and a value type), the space element type is used directly.
+        try:
+            assert(len(self._trials.trials) > i),\
+                ("Trials object is not long enough "
+                 "to retrieve index {}".format(i))
+
+            return self._get_space_element_from_trial(self._trials.trials[i])
+
+        except Exception as e:
+            err = ("Failed to get space element from index. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_pipeline_from_index(self, i: int) -> Union[Pipeline, None]:
+        """
+        Returns the pipeline, with parameters set,
+        of the space element evaluated in the trial number i
+        """
+        try:
+            return self._get_space_element_from_index(i)["pipeline"]
+
+        except Exception as e:
+            err = ("Failed to retrieve pipeline from index. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial(self) -> Union[Dict, None]:
+        """
+        Summary of the best trial together with the space element
+        (name, pipeline, params) that produced it.
+
+        :return: an empty dictionary (after a warning) when no trials
+            are recorded; otherwise a copy of the "result" dictionary
+            of the best trial, updated with the space element
+            when a space is attached. Without an attached space
+            only the result dictionary is returned and a warning is issued.
+            Failures are logged and re-raised through the logger.
+        """
+        if not self._trials.trials:
+
+            self._logger.log_and_throw_warning("Trials object is empty")
+            return {}
+
+        try:
+            # look the best trial up once and reuse it for both
+            # the space element and the result summary
+            best_trial = self._trials.best_trial
+
+            if self.attached_space:
+
+                # the helper deep-copies the trial itself,
+                # so no additional copy is needed here
+                space_element = self._get_space_element_from_trial(
+                        best_trial)
+            else:
+                space_element = {}
+
+                warn = ("Space is not attached, "
+                        "To included the best pipeline "
+                        "attach the space")
+                self._logger.log_and_throw_warning(warn)
+
+            # copy the result so that the stored trial is not modified
+            summary = deepcopy(best_trial["result"])
+
+            summary.update(space_element)
+
+            return summary
+
+        except Exception as e:
+            err = "Failed to retrieve best trial. Exit with error: {}"\
+                .format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score(self) -> Union[float, None]:
+        """
+        Score of the best trial.
+
+        :return: the "score" entry of best_trial,
+            or np.nan when best_trial is empty.
+            Failures are logged and re-raised through the logger.
+        """
+        try:
+            # best_trial is rebuilt (and may warn) on every access,
+            # so read the property only once
+            best_trial = self.best_trial
+
+            if len(best_trial) > 0:
+                return best_trial["score"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial score. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score_variance(self) -> Union[float, None]:
+        """
+        Score variance of the best trial.
+
+        :return: the "score_variance" entry of best_trial,
+            or np.nan when best_trial is empty.
+            Failures are logged and re-raised through the logger.
+        """
+        try:
+            # best_trial is rebuilt (and may warn) on every access,
+            # so read the property only once
+            best_trial = self.best_trial
+
+            if len(best_trial) > 0:
+                return best_trial["score_variance"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial score variance. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
+        """
+        Pipeline of the best trial.
+
+        :return: the "pipeline" entry of best_trial,
+            or np.nan when best_trial is empty
+            (note: np.nan, not None, is the empty marker here).
+            Failures, e.g. a missing "pipeline" entry,
+            are logged and re-raised through the logger.
+        """
+        try:
+            # best_trial is rebuilt (and may warn) on every access,
+            # so read the property only once
+            best_trial = self.best_trial
+
+            if len(best_trial) > 0:
+                return best_trial["pipeline"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial pipeline. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
+        """
+        Collects the pipelines of the n trials with the highest score.
+
+        :param n: maximal number of pipelines to return
+        :return: the list of n best pipelines documented in trials,
+            ordered by decreasing "score" of the trial result;
+            an empty list when no trials are recorded.
+            Failures are logged and re-raised through the logger.
+        """
+        try:
+            recorded_trials = self._trials.trials
+
+            if len(recorded_trials) == 0:
+                return []
+
+            def score_of(trial):
+                return trial["result"]["score"]
+
+            ranked_trials = sorted(recorded_trials,
+                                   key=score_of,
+                                   reverse=True)
+
+            best_pipelines = []
+
+            for trial in ranked_trials[:n]:
+                space_element = self._get_space_element_from_trial(trial)
+                best_pipelines.append(space_element["pipeline"])
+
+            return best_pipelines
+
+        except Exception as e:
+            template = ("Failed to retrieve n best pipelines. "
+                        "Exit with error: {}")
+
+            self._logger.log_and_raise_error(template.format(e))
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
+            -> Union[Dict[str, List[Pipeline]], None]:
+        """
+        Groups the recorded trials by pipeline name and keeps,
+        for every name, the n trials with the highest score.
+
+        :param n: maximal number of pipelines to keep per name
+        :return: a dictionary where keys are pipeline names,
+        and values are lists of best pipelines with this name.
+        Failures are logged and re-raised through the logger.
+        """
+        try:
+            scores = [trial["result"]["score"]
+                      for trial in self._trials.trials]
+
+            names = [self._get_space_element_from_trial(trial)["name"]
+                     for trial in self._trials.trials]
+
+            # The frame has one row per trial, in trial order, so its
+            # default index equals the trial number. Rows are sorted
+            # by name and score in descending order and the first n rows
+            # of every name are kept. reset_index() then moves the trial
+            # number into the "index" column, from which the pipeline
+            # is rebuilt with _get_pipeline_from_index.
+            return pd.DataFrame({"name": names, "score": scores})\
+                     .sort_values(by=["name", "score"], ascending=False)\
+                     .groupby("name")\
+                     .head(n)\
+                     .reset_index()\
+                     .assign(pipeline=lambda x: x["index"]
+                             .apply(self._get_pipeline_from_index))\
+                     .groupby("name")["pipeline"]\
+                     .apply(lambda x: list(x))\
+                     .to_dict()
+
+        except Exception as e:
+            err = ("Failed to get n best pipelines of each type. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def trials_to_excel(self, path: str = None) -> None:
+        """
+        Writes one row per recorded trial to an excel file.
+        Every row merges the result dictionary of the trial
+        with its space element (entries of the space element win
+        when both contain the same key).
+
+        :param path: location handed over to DataFrame.to_excel
+        Failures are logged and re-raised through the logger.
+        """
+        try:
+            recorded_trials = self._trials.trials
+
+            summaries = list(map(lambda trial: trial["result"],
+                                 recorded_trials))
+
+            elements = list(map(self._get_space_element_from_trial,
+                                recorded_trials))
+
+            rows = []
+
+            for summary, element in zip(summaries, elements):
+                row = dict(summary)
+                row.update(element)
+                rows.append(row)
+
+            pd.DataFrame(rows).to_excel(path)
+
+        except Exception as e:
+            template = ("Failed to write trials to excel. "
+                        "Exit with error: {}")
+
+            self._logger.log_and_raise_error(template.format(e))
+
+
+if __name__ == '__main__':
+
+    # elementary example: tunes the sample space on the breast cancer
+    # data set and removes the temporary files at the end
+
+    from sklearn.metrics import roc_auc_score, precision_score
+    from sklearn.datasets import load_breast_cancer
+    from cdplib.log import Log
+    from cdplib.db_handlers import MongodbHandler
+    from cdplib.hyperopt.space_sample import space
+    # from cdplib.hyperopt.composed_space_sample import space
+
+    trials_path = "hyperopt_trials_TEST.pkl"
+    additional_metrics = {"precision": precision_score}
+    strategy_name = "strategy_1"
+    data_path = "data_TEST.h5"
+    cv_path = "cv_TEST.pkl"
+    collection_name = 'TEST_' + strategy_name
+
+    logger = Log("HyperoptPipelineSelector__TEST:")
+
+    logger.info("Start test")
+
+    data_loader = load_breast_cancer()
+
+    X = data_loader["data"]
+    y = data_loader["target"]
+
+    pd.DataFrame(X).to_hdf(data_path, key="X_train")
+    pd.Series(y).to_hdf(data_path, key="y_train")
+
+    # two folds, each given as (train positions, test positions)
+    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
+          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]
+
+    # close the file deterministically instead of leaking the handle
+    with open(cv_path, "wb") as cv_file:
+        pickle.dump(cv, cv_file)
+
+    hs = HyperoptPipelineSelector(cost_func=roc_auc_score,
+                                  greater_is_better=True,
+                                  trials_path=trials_path,
+                                  additional_metrics=additional_metrics,
+                                  strategy_name=strategy_name,
+                                  stdout_log_level="WARNING")
+
+    hs.attach_space(space=space)
+
+    hs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
+                             cv_pickle_path=cv_path)
+
+    # saving the summary is optional: without a reachable mongo
+    # the example only logs a warning and continues
+    try:
+
+        # TODO: this line causes a pytype to throw not-callable error
+        # works fine with pytype on other class methods.
+        save_method = MongodbHandler().insert_data_into_collection
+        save_kwargs = {'collection_name': collection_name}
+
+        # save_method = pd.DataFrame.to_excel()
+        # save_kwargs = {'excel_writer': "TEST.xlsx"}
+
+        hs.configer_summary_saving(save_method=save_method,
+                                   kwargs=save_kwargs)
+
+        logger.info("Configured summary saving in mongo")
+
+    except Exception as e:
+
+        logger.warning(("Could not configure summary saving in mongo. "
+                        "Exit with error: {}".format(e)))
+
+    hs.run_trials(niter=10)
+
+    logger.info("Best Trial: {}".format(hs.best_trial))
+    logger.info("Total tuning time: {}".format(hs.total_tuning_time))
+
+    for file in [trials_path, data_path, cv_path]:
+        os.remove(file)
+
+    logger.info("End test")

+ 0 - 0
cdplib/hyperopt/__init__.py


+ 116 - 0
cdplib/hyperopt/composed_space_sample.py

@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul  6 14:02:24 2020
+
+@author: tanya
+@description: space object to pass to the HyperoptPipelineSelector class
+"""
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
+    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
+from xgboost import XGBRFClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from hyperopt import hp
+import numpy as np
+
+from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
+
+# TODO: add sample spaces for encoders and transformers
+
+# Every step below is a list of candidates, each candidate is a dict
+# {"name": step name, "object": estimator, "params": search space},
+# where "params" maps parameter names of the object to hyperopt
+# expressions. The labels of the hyperopt expressions are prefixed
+# with the step name.
+
+# no candidates for the encoder step yet
+encoders = []
+
+# no candidates for the transformer step yet
+transformers = []
+
+# candidates for the feature selection step
+selectors = [
+    # hp.randint(label, upper) draws an integer from [0, upper),
+    # so the percentile is between 3 and 62
+    {"name": "kbest",
+     "object": SelectPercentile(),
+     "params": {
+       "percentile": 3 + hp.randint("kbest__percentile", 60),
+       "score_func": hp.choice("kbest__score_func",
+                               [f_classif, chi2, mutual_info_classif])}},
+
+    {"name": "fpr",
+     "object": SelectFpr(),
+     "params": {
+        "score_func": hp.choice("fpr__score_func",
+                                [f_classif, chi2]),
+        # mutual_info_classif does not work here
+        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
+
+    # recursive feature elimination on top of a random forest,
+    # parameters of the wrapped estimator use the "estimator__" prefix
+    {"name": "rfe_rf",
+     "object":
+         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
+     "params": {
+         "n_features_to_select":
+             3 + hp.randint("rfe_rf__n_features_to_select", 200),
+         "estimator__n_estimators":
+             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
+
+    # selection from a fitted random forest
+    {"name": "rfm_rf",
+     "object":
+         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
+                                                          random_state=33)),
+     "params": {
+         "estimator__n_estimators":
+             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
+
+    # selection from a fitted logistic regression
+    {"name": "rfm_lr",
+     "object":
+         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
+                                                      random_state=33)),
+     "params": {
+          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
+
+    # a nested pipeline used as a single step: scaling followed by PCA
+    {"name": "std_scaler_pca",
+     "object": Pipeline([
+             ("scaler", StandardScaler()),
+             ("pca", PCA(random_state=33))]),
+     "params": {
+        "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
+       }}
+    ]
+
+# candidates for the final model step, each given as
+# {"name": step name, "object": estimator, "params": search space}
+models = [
+        {"name": "xgb",
+         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
+           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
+           # hp.loguniform expects the bounds of the logarithm of the
+           # value, so the learning rate is drawn from [0.01, 0.5]
+           "learning_rate": hp.loguniform("xgb__learning_rate",
+                                          np.log(0.01), np.log(0.5))
+           }},
+
+        {"name": "rf",
+         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
+           "max_depth": 3 + hp.randint("rf__max_depth", 10),
+           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
+           }},
+
+        # the default solver does not accept l1 penalty
+        {"name": "lr",
+         "object": LogisticRegression(random_state=33,
+                                      solver='liblinear',
+                                      # n_jobs=-1
+                                      ),
+         "params":  {
+           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
+           "C": hp.uniform("lr__C", 0.1, 1000)}},
+
+        # svc does not support parallelization, therefore is slow
+        {"name": "svc",
+         "object": SVC(random_state=33),
+         "params": {
+            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
+            "degree": 2 + hp.randint("svc__degree", 3),
+            "C": hp.uniform("svc__C", 0.1, 1000)
+            }}
+        ]
+
+# order of the steps in the composed pipelines
+step_list = [encoders, transformers, selectors, models]
+
+# hyperopt choice over the pipelines composed from step_list
+space = SpaceComposer().compose_hyperopt_space(step_list)

+ 40 - 0
cdplib/hyperopt/space_sample.py

@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from hyperopt import hp
+import numpy as np
+
+
+# Sample hyperopt space: a choice between two complete pipelines.
+# Every element has the shape
+# {"name": NAME, "pipeline": PIPELINE, "params": PARAMS},
+# where the keys of "params" follow the sklearn convention
+# <step name>__<parameter name>.
+space = hp.choice("pipelines", [
+
+        # scaling -> percentile based feature selection -> random forest
+        {"name": "std_scaler_kbest_rf",
+         "pipeline": Pipeline([
+                 ("std_scaler", StandardScaler()),
+                 ("kbest", SelectPercentile()),
+                 ("rf", RandomForestClassifier())]),
+         "params": {"kbest__percentile":
+                    hp.choice('kbest__percentile', range(1, 3)),
+                    "rf__n_estimators":
+                    50 + hp.randint('rf__n_estimators', 50)}},
+
+        # scaling -> PCA -> logistic regression
+        {"name": "std_scaler_pca_lr",
+         "pipeline": Pipeline([
+                 ("std_scaler", StandardScaler()),
+                 ("pca", PCA()),
+                 ("lr", LogisticRegression())]),
+         # loguniform takes the bounds on the log scale
+         "params": {"lr__C":
+                    hp.loguniform("lr__C", np.log(0.01), np.log(0.1)),
+                    "pca__n_components":
+                    1 + hp.randint("pca__n_components", 4)}}
+        ])

+ 85 - 0
cdplib/hyperparameter_space_composer/SpaceComposer.py

@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 13:54:04 2020
+
+@author: tanya
+@description: a class that from a given list of pipeline steps
+ composes a space to be passed in the GridsearchPipelineSelector
+ or HyperoptPipelineSelector classes.
+ A classic list of steps would be: [encoders, transformers, selectors, models]
+"""
+from sklearn.pipeline import Pipeline
+from hyperopt import hp
+from itertools import product
+
+
+class SpaceComposer:
+    """
+    Builds, from a list of pipeline steps, the space object
+    expected by GridsearchPipelineSelector
+    or HyperoptPipelineSelector.
+    """
+    def compose_gridsearch_space(self, step_list: list) -> list:
+        """
+        Composes a hyperparameter space for input to the
+        GridsearchPipelineSelector class: one space element
+        for every combination that takes one candidate
+        from each non-empty step.
+
+        :param step_list: a classic list of steps would be
+        [encoders, transformers, selectors, models],
+        where, for example, selectors is a list
+        of sklearn feature selectors, each selector given as a dict:
+        for example {"name": "kbest",
+                     "object": SelectPercentile(),
+                     "params": {
+                             "percentile":
+                                 [5, 10, 20],
+                             "score_func":
+                                 [f_classif, chi2, mutual_info_classif]}}
+
+        :return: a list of dictionaries of form
+            {"name": NAME, "pipeline": PIPELINE, "params": PARAMS},
+            where NAME joins the step names with "_" and the keys
+            of PARAMS are prefixed with "<step name>__"
+        """
+        # steps without candidates do not take part in the product
+        non_empty_steps = [candidates for candidates in step_list
+                           if len(candidates) > 0]
+
+        space = []
+
+        for combination in product(*non_empty_steps):
+
+            name = "_".join([step["name"] for step in combination])
+
+            pipeline = Pipeline([(step["name"], step["object"])
+                                 for step in combination])
+
+            params = {}
+
+            for step in combination:
+                for param_name, param_dist in step["params"].items():
+                    params[step["name"] + "__" + param_name] = param_dist
+
+            space.append({"name": name,
+                          "pipeline": pipeline,
+                          "params": params})
+
+        return space
+
+    def compose_hyperopt_space(self, step_list: list) -> hp.choice:
+        """
+        Composes a hyperopt space from a list of steps:
+        a hyperopt choice, labeled "pipelines", over the elements
+        returned by compose_gridsearch_space.
+
+        A classic list of steps would be
+        [encoders, transformers, selectors, models],
+        where, for example, selectors is a list
+        of sklearn feature selectors, each selector given as a dict:
+        for example {"name": "kbest",
+                     "object": SelectPercentile(),
+                     "params": {
+                             "percentile":
+                                 3 + hp.randint("kbest__percentile", 200),
+                             "score_func":
+                                 hp.choice("kbest__score_func",
+                                    [f_classif, chi2, mutual_info_classif])}}
+        """
+        space_elements = self.compose_gridsearch_space(step_list)
+
+        return hp.choice("pipelines", space_elements)

+ 12 - 11
cdplib/log.py

@@ -6,6 +6,7 @@
 import sys
 import os
 import logging
+import warnings
 from datetime import datetime
 
 sys.path.append(os.getcwd())
@@ -15,6 +16,7 @@ class Log():
     '''
     '''
     pass
+
     def __init__(self, name: str = None,
                  log_file: str = None,
                  log_level: str = "ERROR",
@@ -29,7 +31,6 @@ class Log():
             name = ''
 
         self._logger = logging.getLogger(name)
-        
 
         self._logger.setLevel("DEBUG")
 
@@ -37,7 +38,9 @@ class Log():
             self._logger.handlers.clear()
 
         if log_file is None:
-            log_file = os.path.join(".", "logs", str(datetime.today().date()) + ".log")
+            log_file = os.path.join(".",
+                                    "logs",
+                                    str(datetime.today().date()) + ".log")
 
         assert(isinstance(log_file, str)),\
             "Parameter 'log_path' must be of string type"
@@ -60,7 +63,6 @@ class Log():
 
         # self._logger.setLevel(log_level)
 
-
     @property
     def magenta(self):
         return '\033[95m'
@@ -97,7 +99,6 @@ class Log():
     def underline(self):
         return '\033[4m'
 
-
     def info(self, message: str):
         self._logger.info(message)
 
@@ -107,23 +108,23 @@ class Log():
     def error(self, message: str):
         self._logger.error(message)
 
-    def log_and_raise_error(self, message):
+    def log_and_raise_error(self, message, ErrorType=Exception):
         '''
         '''
         self._logger.error(message, exc_info=True)
 
-        raise Exception(message)
+        raise ErrorType(message)
 
-    def log_and_raise_error_stack_info(self, message):
+    def log_and_raise_error_stack_info(self, message, ErrorType=Exception):
         '''
         '''
         self._logger.error(message, exc_info=True, stack_info=True)
 
-        raise Exception(message)
+        raise ErrorType(message)
 
-    def log_and_raise_warning(self, message):
+    def log_and_throw_warning(self, message):
         '''
         '''
-        self._logger.warning(message)
+        self._logger.warning(message, exc_info=True)
 
-        raise Warning(message)
+        warnings.warn(message)

+ 208 - 0
cdplib/ml_validation/CVComposer.py

@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 10:27:39 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List, NewType
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile, chain
+
+from cdplib.log import Log
+
+
+# cv object in the sklearn sense: an iterable of (train, test) index pairs
+CVType = NewType("CVType", Iterable[Tuple[List]])
+
+# any container accepted as a data set (features or target)
+DataSetType = NewType("DataSetType",
+                      Union[pd.DataFrame, pd.Series, np.ndarray, List])
+
+
+class CVComposer:
+    """
+    Groups methods for composing cv objects
+    that follow standards from sklearn,
+    these cv objects can be passed to algorithms like gridsearch, etc
+    """
+    def __init__(self):
+        """
+        Sets up the logger used by the composing methods.
+        """
+        self._logger = Log("CVComposer: ")
+
+    def dummy_cv(
+            self,
+            train_set_size: Union[int, None] = None,
+            train_index: Union[pd.Series, np.ndarray, None] = None,
+            test_set_size: Union[int, None] = None,
+            test_index: DataSetType = None) -> CVType:
+        """
+        Composes a cv object with a single (train, test) split.
+
+        Exactly one of train_index / train_set_size and exactly one
+        of test_index / test_set_size must be given.
+
+        :param train_set_size: number of train rows, the train index
+            becomes list(range(train_set_size))
+        :param train_index: explicit train index, returned as it is
+        :param test_set_size: number of test rows, the test index is
+            positional and starts right after the train rows
+        :param test_index: explicit test index, returned as it is
+        :return: a list with the single tuple (train_index, test_index)
+        """
+        assert((train_index is None) != (train_set_size is None)),\
+            "Set train_index or train_set_size"
+
+        assert((test_index is None) != (test_set_size is None)),\
+            "Set test_index or test_set_size"
+
+        if train_index is None:
+            train_index = list(range(train_set_size))
+
+        if test_index is None:
+            # count the train rows from the index itself, so that
+            # train_index can be combined with test_set_size
+            n_train = len(train_index)
+            test_index = list(range(n_train, n_train + test_set_size))
+
+        return [(train_index, test_index)]
+
+    def dummy_cv_and_concatenated_data_set(
+            self,
+            X_train: DataSetType,
+            y_train: Union[DataSetType, None] = None,
+            X_test: Union[DataSetType, None] = None,
+            y_test: Union[DataSetType, None] = None)\
+            -> Tuple[CVType, DataSetType, Union[DataSetType, None]]:
+        """
+        Concatenates a train and a test set and composes the cv object
+        with the single split that separates them again.
+
+        If both X_train and X_test are dataframes with disjoint indices,
+        the result is a dataframe and the cv refers to its index;
+        otherwise the data is concatenated with numpy and the cv is
+        positional. The same holds for the target, which additionally
+        must share the index of the corresponding features.
+
+        :param X_train: train features
+        :param y_train: train target, optional
+        :param X_test: test features, required (the default only exists
+            because the parameter follows the optional y_train)
+        :param y_test: test target, must be given iff y_train is given
+        :return: tuple (cv, X, y), y is None when no target is passed
+        :raises ValueError: if X_test is not passed
+        """
+        if X_test is None:
+            raise ValueError("X_test must be provided")
+
+        assert((y_test is None) == (y_train is None))
+
+        train_is_frame = isinstance(X_train, pd.DataFrame)
+        test_is_frame = isinstance(X_test, pd.DataFrame)
+
+        # the index can only be kept if no label occurs in both sets
+        use_index = (train_is_frame and test_is_frame and
+                     (len(set(X_train.index) & set(X_test.index)) == 0))
+
+        if use_index:
+
+            cv = self.dummy_cv(train_index=X_train.index,
+                               test_index=X_test.index)
+
+            X = pd.concat([X_train, X_test], ignore_index=False, axis=0)
+
+        else:
+            cv = self.dummy_cv(train_set_size=len(X_train),
+                               test_set_size=len(X_test))
+
+            X = np.concatenate([X_train, X_test])
+
+        use_target_index = use_index and (
+                    isinstance(y_train, pd.Series) and
+                    isinstance(y_test, pd.Series) and
+                    (X_train.index.equals(y_train.index)) and
+                    (X_test.index.equals(y_test.index)))
+
+        if use_target_index:
+
+            y = pd.concat([y_train, y_test], ignore_index=False, axis=0)
+
+        else:
+
+            y = np.concatenate([y_train, y_test]) if (y_train is not None)\
+                else None
+
+        # a dataframe was passed, but the index could not be kept
+        result_to_np = (train_is_frame or test_is_frame) and not use_index
+
+        if result_to_np:
+            self._logger.log_and_throw_warning(
+                    "The concatenated dataframe is converted to numpy")
+
+        return cv, X, y
+
+    def expanding_cv(self, test_proportion: float,
+                     start_train_proportion: float,
+                     step_proportion: float = None,
+                     expanding_test_size: bool = False,
+                     data_set_size: Union[int, None] = None,
+                     index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> Union[Iterable[Tuple[List]], None]:
+        """
+        Generates (train, test) splits in which the train set always
+        starts at the first row and grows by a fixed step,
+        the test set directly follows the train set.
+
+        All proportions refer to the size of the whole data set.
+
+        :param test_proportion: size of the test set; with
+            expanding_test_size it is applied to the current train size
+        :param start_train_proportion: size of the first train set
+        :param step_proportion: growth of the train set per split,
+            required, must correspond to at least one row
+        :param expanding_test_size: if True the test set grows
+            together with the train set
+        :param data_set_size: number of rows, alternative to index
+        :param index: index of the data set, alternative to data_set_size
+        :return: generator of tuples (train index, test index),
+            both given as pandas series. Because this is a generator,
+            failures are logged and raised on the first iteration.
+        """
+        try:
+            assert((index is None) != (data_set_size is None)),\
+                "Set index or data_set_size"
+
+            if step_proportion is None:
+                raise ValueError("Set step_proportion")
+
+            index = pd.Series(index) if (index is not None)\
+                else pd.Series(range(data_set_size))
+
+            data_set_size = data_set_size or len(index)
+
+            start_train_size = int(start_train_proportion * data_set_size)
+            step_size = int(step_proportion * data_set_size)
+
+            test_size = int(test_proportion * data_set_size)
+
+            if start_train_size <= 0:
+                raise ValueError("start_train_proportion is too small, "
+                                 "the first train set would be empty")
+
+            # a zero step would repeat the same split without end
+            if step_size <= 0:
+                raise ValueError("step_proportion is too small, "
+                                 "the train set would never grow")
+
+            # the largest train set must leave room for a full test set
+            train_sizes = range(start_train_size,
+                                data_set_size - test_size + 1,
+                                step_size)
+
+            for train_size in train_sizes:
+
+                if expanding_test_size:
+                    fold_test_size = int(test_proportion * train_size)
+                else:
+                    fold_test_size = test_size
+
+                # positional selection, independent of the labels
+                # of a series passed as index
+                yield (index.iloc[:train_size],
+                       index.iloc[train_size: train_size + fold_test_size])
+
+        except Exception as e:
+            self._logger.log_and_raise_error(("Failed to make expanding cv. "
+                                              "Exit with error: {}".format(e)))
+
+    def sliding_window_cv(
+        self,
+        test_proportion: float,
+        train_proportion: float,
+        step_proportion: float = None,
+        data_set_size: Union[int, None] = None,
+        index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> Union[Iterable[Tuple[List]], None]:
+        """
+        Composes (train, test) splits in which a train window of fixed
+        size slides over the data set by a fixed step,
+        the test set directly follows the train window.
+
+        All proportions refer to the size of the whole data set.
+
+        :param test_proportion: size of the test set
+        :param train_proportion: size of the train window
+        :param step_proportion: shift of the window per split,
+            required, must correspond to at least one row
+        :param data_set_size: number of rows, alternative to index
+        :param index: index of the data set, alternative to data_set_size
+        :return: generator of tuples (train index, test index),
+            both given as pandas series.
+            Failures are logged and re-raised through the logger.
+        """
+        try:
+            assert((index is None) != (data_set_size is None)),\
+                "Set index or data_set_size"
+
+            if step_proportion is None:
+                raise ValueError("Set step_proportion")
+
+            index = pd.Series(index) if (index is not None)\
+                else pd.Series(range(data_set_size))
+
+            data_set_size = data_set_size or len(index)
+
+            train_size = int(train_proportion * data_set_size)
+            test_size = int(test_proportion * data_set_size)
+            step_size = int(step_proportion * data_set_size)
+
+            if train_size <= 0:
+                raise ValueError("train_proportion is too small, "
+                                 "the train window would be empty")
+
+            # a zero step would repeat the same split without end
+            if step_size <= 0:
+                raise ValueError("step_proportion is too small, "
+                                 "the window would never move")
+
+            # the last window must leave room for a full test set
+            train_starts = range(
+                    0,
+                    data_set_size - train_size - test_size + 1,
+                    step_size)
+
+            # positional selection, independent of the labels
+            # of a series passed as index
+            cv = ((index.iloc[start: start + train_size],
+                   index.iloc[start + train_size:
+                              start + train_size + test_size])
+                  for start in train_starts)
+
+            return cv
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                    ("Failed to make sliding window cv. "
+                     "Exit with error: {}".format(e)))
+

+ 0 - 0
cdplib/ml_validation/__init__.py


+ 491 - 0
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 29 13:58:23 2020
+
+@author: tanya
+
+
+@description:
+
+* Input:
+    - pipeline/hyperparameter space
+    - data_train
+    - cv
+    - cv_folds
+
+* For each pipeline:
+
+    -> Split data_train into folds according to cv
+
+     -> For each fold:
+
+         => get data_train_fold, data_test_fold, cv_fold
+
+         => split data_train_fold into subfolds according to cv_fold
+
+         => For each subfold:
+
+             ==> get data_train_subfold, data_test_subfold
+
+             ==> train pipeline on data_train_subfold
+
+             ==> find best_threshold_subfold on data_test_subfold
+
+        => Find averaged_threshold_fold averaged over best_threshold_subfold
+
+        => train pipeline on data_train_fold
+
+        => find score_fold on data_test_fold with proba_threshold_fold
+
+        => find best_threshold_fold on data_test_fold
+
+    -> find score averaged over score_fold
+
+    -> find averaged_threshold averaged over best_threshold_fold
+
+* choose (pipeline/hyperparameters, threshold) in the space with best score
+
+"""
+
+import pandas as pd
+import numpy as np
+from itertools import zip_longest
+from typing import Union, Callable, Dict, Iterable, Tuple, List
+from copy import deepcopy
+from itertools import accumulate, repeat, takewhile, chain
+
+from sklearn.model_selection import StratifiedKFold
+
+from cdplib.log import Log
+
+
+
+
+
+# NOTE(review): ad-hoc scratch calls executed at import time.
+# make_sliding_window_cv is neither defined nor imported in this module,
+# so these statements raise NameError as soon as the module is loaded.
+
+# sliding window cv defined by the data set size
+aa = make_sliding_window_cv(data_set_size=50,
+                            test_proportion=0.1,
+                            train_proportion=0.6,
+                            step_proportion=0.1)
+
+aa = list(aa)
+
+# sliding window cv defined by an explicit datetime index
+aa = make_sliding_window_cv(test_proportion=0.1,
+                            train_proportion=0.6,
+                            step_proportion=0.05,
+                            index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))
+
+aa = list(aa)
+
+
+# TODO: write with yield !!!!
+
+def make_nested_expanding_cv(
+        test_proportion: float,
+        start_train_proportion: float,
+        step_proportion: float = None,
+        expanding_test_size: bool = False,
+        data_set_size: Union[float, None] = None,
+        index: Union[pd.Series, np.ndarray, list, None] = None)\
+        -> Iterable[Tuple[List]]:
+    """
+    Build, for every fold of an expanding cv, an inner expanding cv
+    over the train part of that fold.
+
+    The outer cv is created with make_expanding_cv from the given
+    arguments. For each outer (train, test) pair an inner cv is created
+    with the same proportions, either on the outer train index
+    (when index is given) or on the outer train size
+    (when data_set_size is given).
+
+    :param test_proportion: forwarded to make_expanding_cv
+    :param start_train_proportion: forwarded to make_expanding_cv
+    :param step_proportion: forwarded to make_expanding_cv
+    :param expanding_test_size: forwarded to make_expanding_cv
+    :param data_set_size: size of the data set, exclusive with index
+    :param index: index of the data set, exclusive with data_set_size
+
+    :return: list with one entry per outer fold, each entry being
+        the list of (train, test) pairs of the inner cv
+    """
+    logger = Log("make_nested_expanding_cv:")
+
+    try:
+        # make_expanding_cv is not imported at module level,
+        # without this import every call ended in a NameError
+        from cdplib.ml_validation.expanding_cv import make_expanding_cv
+
+        shared_kwargs = {
+                "test_proportion": test_proportion,
+                "start_train_proportion": start_train_proportion,
+                "step_proportion": step_proportion,
+                "expanding_test_size": expanding_test_size}
+
+        outer_cv = make_expanding_cv(data_set_size=data_set_size,
+                                     index=index,
+                                     **shared_kwargs)
+
+        nested_cv = []
+
+        for train_inds, _ in outer_cv:
+
+            # exactly one of (index, data_set_size) may be set,
+            # the inner cv is described the same way as the outer one
+            if index is not None:
+                fold_index, fold_size = train_inds, None
+            else:
+                fold_index, fold_size = None, len(train_inds)
+
+            fold_cv = make_expanding_cv(data_set_size=fold_size,
+                                        index=fold_index,
+                                        **shared_kwargs)
+
+            nested_cv.append(list(fold_cv))
+
+        return nested_cv
+
+    except Exception as e:
+        logger.log_and_raise_error(("Failed to make nested expanding cv. "
+                                    "Exit with error: {}".format(e)))
+
+
+
+
+# NOTE(review): ad-hoc scratch checks executed at import time.
+# They print the test share of every fold relative to the fold
+# and relative to a data set of 50 rows.
+for train_inds, test_inds in aa:
+    print(len(test_inds)/(len(train_inds) + len(test_inds)))
+    print(len(test_inds)/50)
+
+# NOTE(review): aaa is used here before it is assigned further below
+aaa = list(aaa)
+
+for aaa_cv in aaa:
+    for train_inds, test_inds in aaa_cv:
+        print(len(test_inds)/(len(train_inds) + len(test_inds)))
+        print(len(test_inds)/50)
+
+aaa = make_nested_expanding_cv(#data_set_size=50,
+                               test_proportion=0.1,
+                               start_train_proportion=0.6,
+                               step_proportion=0.1,
+                               index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))
+
+aaa = list(aaa)
+
+
+
+
+
+def cv_slice_dataset(X, y, train_inds, test_inds)\
+        -> Tuple[Union[pd.DataFrame, np.ndarray],
+                 Union[pd.DataFrame, np.ndarray],
+                 Union[pd.Series, np.ndarray, None],
+                 Union[pd.Series, np.ndarray, None]]:
+    """
+    Split features and targets into a train and a validation part.
+
+    :param X: features; a DataFrame is sliced with .loc,
+        anything else with plain [] indexing
+    :param y: targets or None; sliced with plain [] indexing
+    :param train_inds: indices of the train part
+    :param test_inds: indices of the validation part
+
+    :return: tuple (X_train, X_val, y_train, y_val);
+        y_train and y_val are None when y is None
+    """
+    if isinstance(X, pd.DataFrame):
+        X_train = X.loc[train_inds]
+        X_val = X.loc[test_inds]
+    else:
+        X_train = X[train_inds]
+        X_val = X[test_inds]
+
+    # without the defaults the return statement failed
+    # with an UnboundLocalError when no targets were given
+    y_train, y_val = None, None
+
+    if y is not None:
+        y_train = y[train_inds]
+        y_val = y[test_inds]
+
+    return X_train, X_val, y_train, y_val
+
+
+def get_optimal_proba_threshold(score_func: Callable,
+                                y_true: Union[pd.Series, np.ndarray],
+                                proba: Union[pd.Series, np.ndarray],
+                                threshold_set: Union[Iterable, None] = None):
+    """
+    Pick the probability threshold with the highest score.
+
+    For every candidate threshold the probabilities are binarized
+    (proba >= threshold) and scored against y_true with score_func.
+
+    :param score_func: callable (y_true, y_pred) -> score,
+        greater is better
+    :param y_true: true labels
+    :param proba: predicted probabilities
+    :param threshold_set: candidate thresholds,
+        np.arange(0, 1, 0.1) when None
+
+    :return: the candidate with the maximal score
+        (the first one in case of ties)
+    """
+    candidates = np.arange(0, 1, 0.1) if threshold_set is None\
+        else threshold_set
+
+    score_by_threshold = {
+            candidate: score_func(y_true, (proba >= candidate).astype(int))
+            for candidate in candidates}
+
+    return max(score_by_threshold, key=score_by_threshold.get)
+
+
+def cross_validate_with_optimal_threshold(
+        estimator: object,
+        score_func: Callable,
+        X_train: Union[pd.DataFrame, np.ndarray],
+        y_train: Union[pd.Series, np.ndarray, None] = None,
+        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
+        y_val: Union[pd.Series, np.ndarray, None] = None,
+        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
+        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
+        cv: Union[Iterable, int, None] = None,
+        cv_threshold: Union[Iterable, int, None] = None,
+        additional_metrics: Union[Dict[str, Callable], None] = None,
+        threshold_set: Union[Iterable, None] = None,
+        scores: Dict = None)\
+            -> Dict:
+    """
+    Score an estimator while tuning its probability threshold.
+
+    Without cv, the estimator is evaluated on (X_val, y_val).
+    The threshold is tuned on the folds of cv_threshold, or, when
+    cv_threshold is None, on (X_val_threshold, y_val_threshold),
+    falling back to (X_val, y_val). The averaged threshold is stored
+    and used for the train and test predictions.
+
+    With cv, the function calls itself once per fold of cv, using
+    the matching element of cv_threshold (if any) for that fold and
+    accumulating the results in one scores dictionary.
+
+    :param estimator: object with fit and predict_proba
+    :param score_func: callable (y_true, y_pred) -> score
+    :param X_train: train features
+    :param y_train: train targets
+    :param X_val: validation features, required when cv is None
+    :param y_val: validation targets, required when cv is None
+    :param X_val_threshold: optional features for threshold tuning,
+        only used when cv and cv_threshold are None
+    :param y_val_threshold: targets belonging to X_val_threshold
+    :param cv: None, a number of stratified folds or
+        an iterable of (train_inds, val_inds)
+    :param cv_threshold: when cv is None: None, a number of stratified
+        folds or an iterable of (train_inds, val_inds);
+        when cv is set: an iterable with one such element per fold of cv
+    :param additional_metrics: {"name": metric} computed additionally
+        to score_func with the same predictions
+    :param threshold_set: candidate thresholds passed to
+        get_optimal_proba_threshold
+    :param scores: dictionary of lists to append the results to
+
+    :return: dictionary with lists under "test_threshold", "test_score",
+        "train_score" and "train_"/"test_" + name per additional metric
+    """
+    logger = Log("cross_validate_with_optimal_threshold:")
+
+    X_train = deepcopy(X_train)
+    y_train = deepcopy(y_train)
+    X_val = deepcopy(X_val)
+    y_val = deepcopy(y_val)
+    X_val_threshold = deepcopy(X_val_threshold)
+    y_val_threshold = deepcopy(y_val_threshold)
+
+    scores = scores or {"test_threshold": [],
+                        "test_score": [],
+                        "train_score": []}
+
+    additional_metrics = additional_metrics or {}
+
+    # the candidates are iterated once per fold,
+    # a one-shot iterable would be exhausted after the first one
+    if threshold_set is not None:
+        threshold_set = list(threshold_set)
+
+    for metric_name, metric in additional_metrics.items():
+        if "test_" + metric_name not in scores:
+            scores["test_" + metric_name] = []
+            scores["train_" + metric_name] = []
+
+    if cv is None:
+
+        # test score is calculated on X_vals
+
+        assert((X_val is not None) and (y_val is not None)),\
+            "Validation set must be set"
+
+        if cv_threshold is None:
+
+            refit = (X_val_threshold is not None)
+
+            # if a validation set for proba threshold tuning is not given,
+            # we use the validation set on which we calculate the test score
+            # (this might lead to overfitting)
+
+            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
+            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
+
+            # NOTE(review): make_dummy_cv is neither defined nor imported
+            # in this module; X_train and y_train are replaced
+            # by whatever it returns - confirm its contract
+            cv_threshold, X_train, y_train = make_dummy_cv(
+                    X_train=X_train,
+                    y_train=y_train,
+                    X_val=X_val_threshold,
+                    y_val=y_val_threshold)
+        else:
+
+            # if cv_threshold is given, we find the optimal threshold
+            # on each fold and output the average value for the threshold
+
+            if (X_val_threshold is not None):
+                logger.log_and_throw_warning((
+                        "X_val_threshold is set "
+                        "but cv_threshold will be used"))
+
+            if isinstance(cv_threshold, int):
+                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
+                    .split(X=X_train, y=y_train)
+
+            refit = True
+
+        thresholds = []
+
+        for train_inds, val_inds in cv_threshold:
+
+            print("----- In cv threshold fold")
+
+            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
+                cv_slice_dataset(X=X_train,
+                                 y=y_train,
+                                 train_inds=train_inds,
+                                 test_inds=val_inds)
+
+            estimator.fit(X_train_fold, y_train_fold)
+
+            proba_val = estimator.predict_proba(X_val_fold)[:, 1]
+
+            # threshold_set used to be accepted but never forwarded
+            threshold = get_optimal_proba_threshold(
+                    score_func=score_func,
+                    y_true=y_val_fold,
+                    proba=proba_val,
+                    threshold_set=threshold_set)
+
+            thresholds.append(threshold)
+
+            print("----- Threshold:", threshold)
+
+        # the scores are calculated with the same averaged threshold
+        # that is reported (previously the threshold of the last fold
+        # was applied while the average was stored)
+        averaged_threshold = np.mean(thresholds)
+
+        scores["test_threshold"].append(averaged_threshold)
+
+        if refit:
+
+            estimator.fit(X_train, y_train)
+
+            proba_val = estimator.predict_proba(X_val)[:, 1]
+
+        # when refit is False, proba_val of the (single) threshold fold
+        # is reused for the test score
+        proba_train = estimator.predict_proba(X_train)[:, 1]
+
+        pred_train = (proba_train >= averaged_threshold)
+        pred_val = (proba_val >= averaged_threshold)
+
+        train_score = score_func(y_train, pred_train)
+        test_score = score_func(y_val, pred_val)
+
+        for metric_name, metric in additional_metrics.items():
+            scores["train_" + metric_name].append(metric(y_train, pred_train))
+            scores["test_" + metric_name].append(metric(y_val, pred_val))
+
+        scores["train_score"].append(train_score)
+        scores["test_score"].append(test_score)
+
+        return scores
+
+    else:
+
+        if isinstance(cv, int):
+            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)
+
+        cv_threshold = cv_threshold or []
+
+        # zip_longest pads a missing cv_threshold element with None,
+        # in which case the fold tunes the threshold on its own
+        # validation part
+        for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):
+
+            print("=== In cv fold")
+
+            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
+                cv_slice_dataset(X=X_train,
+                                 y=y_train,
+                                 train_inds=train_inds,
+                                 test_inds=val_inds)
+
+            scores = cross_validate_with_optimal_threshold(
+                    estimator=estimator,
+                    score_func=score_func,
+                    X_train=X_train_fold,
+                    y_train=y_train_fold,
+                    X_val=X_val_fold,
+                    y_val=y_val_fold,
+                    cv_threshold=cv_fold,
+                    additional_metrics=additional_metrics,
+                    threshold_set=threshold_set,
+                    scores=scores)
+
+            print("=== scores:", scores)
+
+        return scores
+
+
+# Manual smoke test of cross_validate_with_optimal_threshold
+# on the breast cancer data set for several combinations
+# of cv, cv_threshold and X_val_threshold.
+if __name__ == "__main__":
+
+    from sklearn.metrics import accuracy_score, precision_score
+    from sklearn.datasets import load_breast_cancer
+    from xgboost import XGBRFClassifier
+    from sklearn.model_selection import train_test_split
+
+    data_loader = load_breast_cancer()
+
+    X = data_loader["data"]
+    y = data_loader["target"]
+
+    X_train, X_val, y_train, y_val = train_test_split(X, y)
+
+    estimator = XGBRFClassifier()
+
+    score_func = accuracy_score
+
+    additional_metrics = {"precision": precision_score}
+
+    # one entry per test case that appends to these lists
+    averaged_scores = []
+    averaged_thresholds = []
+
+    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
+
+    scores = cross_validate_with_optimal_threshold(
+            estimator=estimator,
+            score_func=accuracy_score,
+            X_train=X_train,
+            y_train=y_train,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=None,
+            y_val_threshold=None,
+            cv=None,
+            cv_threshold=None,
+            additional_metrics=additional_metrics)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    # X_train and y_train are rebound here: all following test cases
+    # run on the reduced train set
+    X_train, X_val_threshold, y_train, y_val_threshold =\
+        train_test_split(X_train, y_train)
+
+    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
+
+    scores = cross_validate_with_optimal_threshold(
+            estimator=estimator,
+            score_func=accuracy_score,
+            X_train=X_train,
+            y_train=y_train,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv=None,
+            cv_threshold=None,
+            additional_metrics=additional_metrics)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    print("\nTesting cv=None, cv_threshold=3 \n")
+
+    scores = cross_validate_with_optimal_threshold(
+            estimator=estimator,
+            score_func=accuracy_score,
+            X_train=X_train,
+            y_train=y_train,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv=None,
+            cv_threshold=3,
+            additional_metrics=additional_metrics)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    print("\nTesting cv=3, cv_threshold=None \n")
+
+    # with cv set, the validation sets passed here are not forwarded
+    # to the per-fold calls
+    scores = cross_validate_with_optimal_threshold(
+            estimator=estimator,
+            score_func=accuracy_score,
+            X_train=X_train,
+            y_train=y_train,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv=3,
+            cv_threshold=None,
+            additional_metrics=additional_metrics)
+
+    print("\nScores:", scores)
+
+    # NOTE(review): unlike the other test cases, this one does not
+    # append to averaged_scores / averaged_thresholds - confirm intent
+
+    print("\n ########################################################## \n")
+
+    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
+
+    scores = cross_validate_with_optimal_threshold(
+            estimator=estimator,
+            score_func=accuracy_score,
+            X_train=X_train,
+            y_train=y_train,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv=3,
+            cv_threshold=[3, 3, 3],
+            additional_metrics=additional_metrics)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    # TODO: check overwriting X_train,
+    # additional metrics append instead of overwrite
+    # check the length of cv_threshold
+    # test custom cv, cv_threshold
+
+    print("\n Averaged test score:", averaged_scores)
+    print("\n Averaged threshold:", averaged_thresholds)

+ 97 - 0
cdplib/ml_validation/expanding_cv.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 09:55:52 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile
+
+from cdplib.log import Log
+
+
+def make_expanding_cv(test_proportion: float,
+                      start_train_proportion: float,
+                      step_proportion: float = None,
+                      expanding_test_size: bool = False,
+                      data_set_size: Union[int, None] = None,
+                      index: Union[pd.Series, np.ndarray, list, None] = None)\
+        -> Union[Iterable[Tuple[List]], None]:
+    """
+    Generate the folds of an expanding window cross-validation.
+
+    The train part of every fold starts at position 0; its size starts
+    at start_train_proportion * data_set_size and grows by
+    step_proportion * data_set_size per fold. The test part directly
+    follows the train part. Folds are generated as long as the
+    train size does not exceed data_set_size - test_size.
+
+    Exactly one of data_set_size and index must be set.
+
+    :param test_proportion: size of the test part as a proportion
+        of the data set, or of the train part of the fold
+        when expanding_test_size is True
+    :param start_train_proportion: size of the first train part
+        as a proportion of the data set
+    :param step_proportion: growth of the train part per fold
+        as a proportion of the data set
+    :param expanding_test_size: when True the test part grows
+        together with the train part
+    :param data_set_size: number of rows of the data set
+    :param index: index of the data set; a list or a tuple
+        is converted to a numpy array
+
+    :return: generator of tuples (train part of the index,
+        test part of the index)
+    """
+    logger = Log("make_expanding_cv:")
+
+    try:
+        assert((index is None) != (data_set_size is None)),\
+            "Set index or data_set_size"
+
+        if index is None:
+            index = pd.Series(range(data_set_size))
+        elif isinstance(index, (list, tuple)):
+            # plain sequences can not be indexed with a list of positions
+            index = np.asarray(index)
+
+        data_set_size = data_set_size or len(index)
+
+        start_train_size = int(start_train_proportion * data_set_size)
+        step_size = int(step_proportion * data_set_size)
+
+        test_size = int(test_proportion * data_set_size)
+
+        if step_size < 1:
+            # with a zero step the train size never grows
+            # and the generator would never terminate
+            raise ValueError("step_proportion is too small "
+                             "for the given data set size")
+
+        if start_train_size < 1:
+            raise ValueError("start_train_proportion is too small "
+                             "for the given data set size")
+
+        train_sizes = takewhile(
+                lambda size: size <= data_set_size - test_size,
+                accumulate(repeat(start_train_size),
+                           lambda size, _: size + step_size))
+
+        for train_size in train_sizes:
+
+            train_inds = list(range(train_size))
+
+            fold_test_size = int(test_proportion * train_size)\
+                if expanding_test_size else test_size
+
+            yield (index[train_inds],
+                   index[train_size: train_size + fold_test_size])
+
+    except Exception as e:
+        logger.log_and_raise_error(("Failed to make expanding cv. "
+                                    "Exit with error: {}".format(e)))
+
+
+if __name__ == "__main__":
+
+    # manual smoke test: the folds are only materialized, not checked
+    logger = Log("Test_expanding_cv: ")
+
+    logger.info("Start Testing")
+
+    logger.info("Testing expanding cv: ")
+
+    # cv defined by the data set size, with a growing test part
+    size_based_kwargs = {"data_set_size": 50,
+                         "test_proportion": 0.1,
+                         "start_train_proportion": 0.6,
+                         "step_proportion": 0.1,
+                         "expanding_test_size": True}
+
+    cv = list(make_expanding_cv(**size_based_kwargs))
+
+    logger.info("Testing expanding cv with datetime index")
+
+    # cv defined by an explicit datetime index
+    date_index = pd.date_range(start=pd.to_datetime("2020-01-01"),
+                               periods=50)
+
+    cv = list(make_expanding_cv(test_proportion=0.1,
+                                start_train_proportion=0.6,
+                                step_proportion=0.1,
+                                index=date_index))
+
+    logger.info("Finish testing")

+ 789 - 0
cdplib/pipeline_selector/PipelineSelector.py

@@ -0,0 +1,789 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:23:23 2020
+
+@author: tanya
+@description: an abstract class for selecting a machine learning
+ pipeline from a space (deterministic or random) of parameter distributions
+ over multiple pipelines.
+ The selection is thought in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far
+ as well as the entire tuning history if needed.
+ Methods configure_cross_validation and configure_result_saving
+ allow to use a custom cross-validation method and
+ save the current best result in a file or database during training.
+ Children classes: hyperopt and custom gridsearch.
+"""
+
+import pickle
+import os
+import sys
+import time
+import datetime
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+from abc import ABC, abstractmethod, abstractproperty
+from typing import Callable, Optional, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
+import functools
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
+from sklearn.metrics import make_scorer
+from hyperopt import STATUS_OK, STATUS_FAIL
+from cdplib.log import Log
+from cdplib.utils import ExceptionsHandler
+from cdplib.utils import LoadingUtils
+from cdplib.ml_validation import CVComposer
+
+sys.path.append(os.getcwd())
+
+
+# Shape of one element of the pipeline/hyperparameter space.
+class SpaceElementType(TypedDict):
+    # name of the space element
+    name: str
+    # sklearn pipeline of the space element
+    pipeline: Pipeline
+    # parameters belonging to the pipeline
+    params: dict
+
+
+class PipelineSelector(ABC):
+    """
+    An abstract class for selecting a machine learning
+    pipeline from a space (deterministic or random) of parameter
+    distributions over multiple pipelines.
+    The selection is though in such a way that a Trials object is being
+    maintained during the tuning process from which one can retrieve
+    the best pipeline so far as well as the entire tuning history
+    if needed.
+    Methods configure_cross_validation and configure_result_saving
+    allow to use a custom cross-validation method and
+    save the current best result in a file or database during training.
+    Children classes: hyperopt and custom gridsearch.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 additional_averaging_funcs:
+                     Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+            If None, the trials object is backed up every time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: function to aggregate
+            the cross-validation scores of the cost_func.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metrics: dict of additional metrics to keep
+            track of in the trials, of the form {"metric_name": metric}.
+
+        :param additional_averaging_funcs: functions used to aggregate
+            the output of the cross_validate function.
+            The output always contains the scores of the cost_func,
+            additional_metrics (if it is not empty),
+            but it can also contain additional information
+            (like probability threshold for example)
+            if different from cross_val_averaging_func.
+            Of the form {"metric_name": averaging_func}
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
+
+        try:
+
+            ExceptionsHandler(self._logger)\
+                .assert_is_directory(path=trials_path)
+
+            # flags telling which configuration steps were performed
+            self.attached_space = False
+            self.attached_data = False
+            self.configured_cross_validation = False
+            self.configured_summary_saving = False
+
+            # NOTE(review): cross_val_averaging_func, additional_metrics
+            # and additional_averaging_funcs are accepted but not stored
+            # anywhere in this constructor - confirm whether they should
+            # be kept as attributes
+            self._cost_func = cost_func
+            # score factor is 1 when cost_func is minimized,
+            # -1 when cost func is maximized
+            self._score_factor = (not greater_is_better) - greater_is_better
+            self.trials_path = trials_path
+            self._backup_trials_freq = backup_trials_freq
+            self._strategy_name = strategy_name
+            self._data_path = None
+            self._cv_path = None
+
+            self._X = None
+            self._y = None
+            self._cv = None
+            self._space = None
+
+            # if cross-validation is not configured,
+            # sklearn cross-validation method is taken by default
+            self._cross_validation = sklearn_cross_validation
+
+            # if a trials object already exists at the given path,
+            # it is loaded and the search is continued. Else,
+            # the search is started from the beginning.
+            if os.path.isfile(self.trials_path):
+
+                # NOTE(review): unpickling executes arbitrary code,
+                # trials_path must point to a trusted file
+                with open(self.trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                # number_of_trials and best_trial_score are read
+                # after the trials object has been loaded
+                self._start_iteration = self.number_of_trials
+
+                self.best_score = self.best_trial_score
+
+                # a space was missing between the two message parts
+                self._logger.info(("Loaded an existing trials object "
+                                   "consisting of {} trials")
+                                  .format(self._start_iteration))
+
+            else:
+                self._logger.warning(("No existing trials object was found, "
+                                      "Starting from scratch."))
+
+                self._trials = None
+                self._start_iteration = 0
+                self.best_score = np.nan
+
+            # keeping track of the current search iteration
+            self._iteration = self._start_iteration
+            self._score_improved = False
+
+            self.start_tuning_time = datetime.datetime.today()
+            self.total_tuning_time = None
+            self.finished_tuning = False
+
+        except Exception as e:
+            err = ("Failed to initialize the class. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _backup_trials(self) -> None:
+        '''
+        Write the current trials object to self.trials_path
+        as a pickle file, replacing an existing file.
+        '''
+        try:
+            with open(self.trials_path, "wb") as trials_file:
+                pickle.dump(self._trials, trials_file)
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                    "Could not backup trials. Exit with error: {}".format(e))
+
+    def configure_cross_validation(self,
+                                   cross_validation: Callable,
+                                   kwargs: dict = None) -> None:
+        """
+        Method for attaching a custom cross-validation function
+
+        :param cross_validation: a function that has the same
+             signature as sklearn.model_selection.cross_validate
+
+        :param kwargs: optional keyword arguments that are bound
+            to cross_validation with functools.partial
+        """
+        try:
+            kwargs = kwargs or {}
+
+            # bind the function passed by the caller; previously the
+            # argument was ignored and the current
+            # self._cross_validation was wrapped instead
+            self._cross_validation = functools.partial(
+                    cross_validation, **kwargs)
+
+            self.configured_cross_validation = True
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to configure cross-validation. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation_from_module(self,
+                                               module_path: str,
+                                               name: str) -> None:
+        """
+        Attaches a cross-validation function defined in
+        a different python module. This function must have
+        the same signature as sklearn.model_selection.cross_validate
+
+        :param str module_path: path to python module
+            where the cross_validation function is defined.
+
+        :param str name: name of the cross validation function
+            loaded from a python module.
+        """
+        try:
+            loader = LoadingUtils()
+
+            self._cross_validation = loader.load_from_module(
+                    module_path=module_path, name=name)
+
+            self.configured_cross_validation = True
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                    "Failed to load cross-validation from module. "
+                    "Exit with error: {}".format(e))
+
+    def attach_space(self, space) -> None:
+        """
+        Register the pipeline/hyperparameter space
+        over which the score_func is optimized.
+
+        :param space: space where the search is performed.
+            Either a list of dictionaries or a hyperopt space object
+            whose elements are dictionaries with the keys
+            name, pipeline, params
+        """
+        try:
+            self._space, self.attached_space = space, True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                    "Failed to attach space. "
+                    "Exit with error: {}".format(e))
+
+    def attach_space_from_module(self, module_path: str, name: str) -> None:
+        """
+        Attaches a space defined in a different python module.
+
+        :param str module_path: path to python module
+            where the space is defined.
+
+        :param str name: name of the space loaded from
+            a python module.
+        """
+        try:
+            self._space = LoadingUtils().load_from_module(
+                    module_path=module_path, name=name)
+
+            self.attached_space = True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            err = ("Failed to attach space from module. "
+                   "Exit with error {}".format(e))
+
+            # the method name was misspelled (loger_and_raise_error),
+            # which replaced the real error with an AttributeError
+            self._logger.log_and_raise_error(err)
+
+    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
+                    y_train: Optional[Union[pd.DataFrame, pd.Series,
+                                            np.ndarray]]
+                    = None,
+                    X_val: Optional[Union[pd.DataFrame, np.ndarray]]
+                    = None,
+                    y_val: Optional[Union[pd.DataFrame, pd.Series,
+                                          np.ndarray]]
+                    = None,
+                    cv: Optional[Iterable[Tuple[List[int], List[int]]]]
+                    = None) -> None:
+        '''
+        Attaches the data and the cv object used for the search.
+        Exactly one of cv and X_val must be provided.
+
+        When X_val is given, train and validation data are concatenated
+        (in this order) and a cv with a single split is created:
+        its train indices address the rows of X_train,
+        its validation indices the rows of X_val.
+
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (None in case of unsupervised learning)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: iterable of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+        # Optional accepts exactly one type, the annotations
+        # Optional[A, B, C] raised a TypeError when the class was defined
+        try:
+            assert((cv is None) == (X_val is not None)),\
+                "Either cv or X_val must be provided"
+
+            if cv is None:
+
+                assert((y_val is None) == (y_train is None)),\
+                    "y_train and y_val must be simultanious"
+
+                # Here we create a trivial cv object
+                # with one validation split.
+                # (a leftover call to CVComposer.dummy_cv(), the result
+                # of which was never used, has been removed)
+                train_inds = list(range(len(X_train)))
+                val_inds = list(range(len(X_train),
+                                      len(X_train) + len(X_val)))
+
+                self._cv = [(train_inds, val_inds)]
+
+                # np.concatenate returns numpy arrays
+                self._X = np.concatenate([X_train, X_val])
+                self._y = None if y_train is None\
+                    else np.concatenate([y_train, y_val])
+
+            else:
+
+                self._cv = cv
+                self._X = X_train
+                self._y = y_train
+
+            self.attached_data = True
+
+            self._logger.info("Attached data")
+
+        except Exception as e:
+            err = ("Failed to attach data. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_data_from_hdf5(self,
+                              data_hdf5_store_path: str,
+                              cv_pickle_path: Optional[str] = None) -> None:
+        """
+        Method for attaching data from a hdf5 store
+         and a cv object from a pickled file.
+
+         The hdf5 store is a binary file,
+         after loading it, it is a dictionary with keys
+         X_train (y_train, X_val, y_val).
+         Keys missing in the store are passed on as None.
+
+         The cv is loaded from a pickle file.
+
+         The reason to separate the data
+         store from the cv store, is the hdf5 is optimized to
+         store large dataframes (especially with simple types) and
+         a small list of lists like a cv-object is better
+         to be stored as a pickle file.
+
+        :param str data_hdf5_store_path: path to the hdf5 store
+            with train and validation data
+        :param str cv_pickle_path: optional, path to the pickle
+            file with the cv data
+        """
+        try:
+            assert(os.path.isfile(data_hdf5_store_path)),\
+                "Parameter hdf5_store_path is not a file"
+
+            # the context manager closes the store
+            # even if reading one of the keys fails
+            with pd.HDFStore(data_hdf5_store_path) as store:
+
+                data_input = {key: store[key] if key in store else None
+                              for key in ["X_train", "y_train",
+                                          "X_val", "y_val"]}
+
+            self._data_path = data_hdf5_store_path
+
+            if cv_pickle_path is not None:
+
+                assert(os.path.isfile(cv_pickle_path)),\
+                    "Parameter cv_pickle_path is not a file"
+
+                # NOTE: unpickling can execute arbitrary code,
+                # only cv files from a trusted source must be loaded.
+                with open(cv_pickle_path, "rb") as cv_file:
+                    data_input["cv"] = pickle.load(cv_file)
+
+                self._cv_path = cv_pickle_path
+
+            else:
+                data_input["cv"] = None
+
+            self.attach_data(**data_input)
+
+        except Exception as e:
+            err = "Failed to attach data. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def default_summary(self) -> dict:
+        """
+        Basic description of the current tuning run.
+
+        Contains the strategy name, the name of the cost function,
+        the paths to the trials, data and cv (when they are set),
+        the start time of the tuning and the current iteration.
+        The _objective method copies this dictionary and
+        extends it with the scores of the evaluated space element.
+        """
+        info = {}
+
+        if self._strategy_name is not None:
+            info["strategy_name"] = self._strategy_name
+
+        # the cost function is either given by its name
+        # or is a callable whose name is reported
+        cost = self._cost_func
+
+        if isinstance(cost, str):
+            info["cost_func"] = cost
+
+        elif hasattr(cost, "__name__"):
+            info["cost_func"] = cost.__name__
+
+        info["trials_path"] = self.trials_path
+
+        # optional paths are only reported when they are known
+        for key, path in (("data_path", self._data_path),
+                          ("cv_path", self._cv_path)):
+            if path is not None:
+                info[key] = path
+
+        info["start_tuning_time"] = self.start_tuning_time
+        info["iteration"] = self._iteration
+
+        return info
+
+    def configer_summary_saving(self,
+                                save_method: Callable
+                                = functools.partial(
+                                        pd.DataFrame.to_csv,
+                                        **{"path_or_buf": "result.csv"}),
+                                kwargs: Optional[dict] = None) -> None:
+        """
+        Configures how the summary is saved.
+
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
+
+        :param Callable save_method: method for saving the result
+            of the pipeline selection. The method must accept
+            a pandas DataFrame as argument.
+            By default, saving to the csv file "result.csv"
+            (path_or_buf is a keyword of DataFrame.to_csv,
+            DataFrame.to_excel does not accept it).
+
+            Examples:
+                functools.partial(pd.DataFrame.to_csv,
+                                  **{"path_or_buf": <PATH>})
+                functools.partial(np.savetxt, **{"fname": <PATH>})
+
+                functools.partial(SQLHandler(<URI>).append_to_table,
+                                  **{"tablename": <NAME>})
+
+                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
+                                  **{"collection_name": <NAME>})
+
+            using functools can be avoided by providing the kwarg argument
+
+        :param dict kwargs: a dictionary with keyword arguments
+            (like tablename) to provide to the save_method
+        """
+        try:
+            kwargs = kwargs or {}
+
+            # bind the keyword arguments once, so that _save_summary
+            # only has to pass the summary itself
+            self._save_method = functools.partial(save_method, **kwargs)
+
+            self.configured_summary_saving = True
+
+            self._logger.info("Configured summary saving")
+
+        except Exception as e:
+            err = ("Failed to configure the summary saving. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _save_summary(self, summary: dict) -> None:
+        """
+        Passes the summary to the save method set up
+        by configer_summary_saving.
+
+        Called from _objective when the score improves
+        and the summary saving is configured.
+
+        :param dict summary: summary of the current trial
+        """
+        try:
+            assert(self.configured_summary_saving),\
+                "Result saving must be configured first"
+
+            self._save_method(summary)
+
+        except Exception as e:
+            # the failure happens while saving, not while configuring
+            err = ("Could not save summary. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
+        """
+        Calculates the averaged cross-validated score and score variance,
+        as well as the averaged values and variances of the additional metrics.
+
+        This method is called in the _objective method.
+
+        This function can be overridden, when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :return: dictionary with the aggregated
+            cross-validation scores and
+            the score variances for the scores in the output
+            of the cross-validation function.
+            Only the keys of the cross-validation output
+            starting with "test" are aggregated,
+            the "test_" part is removed from their names.
+
+            form of the output:
+                {"score": 10, #score used in optimization,
+                 "score_variance": 0.5
+                 "additional_metric1": 5,
+                 "additional_metric1_variance": 7}
+
+            a custom cross-validation function can also include for
+            example probability threshold for each fold, then
+            the output of this function will include the average
+            value and the variance of the probability threshold
+            over the folds.
+        """
+        try:
+            # NOTE(review): this dictionary is built but never used,
+            # the cross-validation below receives self._scoring instead.
+            # It also reads self.cost_func, while default_summary
+            # reads self._cost_func - confirm which ones are intended.
+            scoring = {"score": make_scorer(self.cost_func)}
+
+            scoring.update({metric_name: make_scorer(metric)
+                            for metric_name, metric
+                            in self._additional_metrics.items()})
+
+            # failed fits are reported as nan instead of raising
+            scores = self._cross_validation(
+                    estimator=pipeline,
+                    X=self._X,
+                    y=self._y,
+                    cv=self._cv,
+                    scoring=self._scoring,
+                    error_score=np.nan)
+
+            # a metric specific averaging function takes precedence
+            # over the general cross-validation averaging function
+            averaging_funcs = {
+                    metric_name: self._additional_averaging_funcs[metric_name]
+                    if metric_name in self._additional_averaging_funcs
+                    else self._cross_val_averaging_func
+                    for metric_name in scores}
+
+            scores_average = {
+                    metric_name.replace("test_", ""):
+                    averaging_funcs[metric_name](scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            scores_variance = {
+                    metric_name.replace("test_", "") + "_variance":
+                    np.var(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            return {**scores_average, **scores_variance}
+
+        except Exception as e:
+            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    def _objective(self, space_element: SpaceElementType) -> dict:
+        '''
+        Evaluates one element of the space and returns
+        the summary of the trial.
+
+        This method is called in run_trials method
+        that is using the hyperopt fmin optimizer.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        the loss is obtained by multiplying the score
+        returned by _evaluate with self._score_factor.
+
+        :param SpaceElementType space_element: element
+            of the space over which the optimization is done,
+            a dictionary with keys pipeline, params and name
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+            and other keys given in self.default_summary.
+            When the trial fails, the exception is not raised,
+            the status is STATUS_FAIL, the error is stored
+            under the key error and the numeric values are nan.
+        '''
+        try:
+            start_time = time.time()
+
+            assert(self.attached_data),\
+                ("Data must be attached in order "
+                 "to effectuate the best "
+                 "pipeline search")
+
+            summary = deepcopy(self.default_summary)
+
+            # backup the current trials if the score improved
+            # at previous iteration or every ith iteration
+            # if the backup_trials_freq is set
+            backup_cond = ((self._backup_trials_freq is not None) and
+                           ((self._iteration - self._start_iteration - 1) %
+                            self._backup_trials_freq == 0)) or\
+                self._score_improved
+
+            if backup_cond:
+                self._backup_trials()
+                self._score_improved = False
+
+            pipeline = space_element['pipeline']
+            params = space_element['params']
+            pipeline.set_params(**params)
+
+            self._logger.info(("Iteration {0}: "
+                               "Current score is {1}: "
+                               "Training pipeline {2} "
+                               "with parameters: {3}. ").format(
+                                  self._iteration,
+                                  self.best_score,
+                                  space_element['name'],
+                                  params))
+
+            result = self._evaluate(pipeline)
+
+            end_time = time.time()
+
+            summary.update(result)
+            summary['status'] = STATUS_OK
+            summary['loss'] = self._score_factor * summary['score']
+            summary['timestamp'] = datetime.datetime.today()
+            summary['train_time'] = end_time - start_time
+
+            self._iteration += 1
+
+            # the first comparison is only true when best_score is nan,
+            # in which case any score counts as an improvement
+            self._score_improved = (self.best_score != self.best_score) or\
+                                   (self._score_factor*result["score"] <
+                                    self._score_factor*self.best_score)
+
+            if self._score_improved:
+
+                self._logger.info("Score improved, new best score is: {}"
+                                  .format(result["score"]))
+
+                self.best_score = result['score']
+
+                if self.configured_summary_saving:
+                    self._save_summary(summary)
+
+        except Exception as e:
+
+            # a failed trial must not stop the whole search,
+            # it is reported through the status of the summary
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            summary = {}
+            summary['status'] = STATUS_FAIL
+            summary['timestamp'] = datetime.datetime.today()
+            summary['error'] = e
+            for key in ['loss', 'score', 'score_variance', 'train_time']:
+                summary[key] = np.nan
+
+        return summary
+
+    @abstractmethod
+    def run_trials(self):
+        """
+        Runs the hyperparameter tuning, possibly over several
+        pipeline types specified in self.space.
+
+        Must be implemented by the child classes.
+        An implementation should set the flag self.finished_tuning
+        to True when the tuning is done and call self._backup_trials
+        and optionally self._save_result.
+        """
+        return None
+
+    @property
+    @abstractmethod
+    def number_of_trials(self) -> int:
+        """
+        Number of trials already run in the current trials object.
+
+        Abstract property, must be implemented by the child classes.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial(self) -> dict:
+        """
+        Best trial so far.
+         Should contain the status, pipeline,
+         hyperparameters, and the score (loss).
+         Other information is optional and is defined
+         by self.default_summary
+
+        Abstract property, must be implemented by the child classes.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial_score(self) -> float:
+        """
+        Score of the best pipeline with the best hyperparameters.
+
+        Abstract property, must be implemented by the child classes.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial_score_variance(self) -> float:
+        """
+        Variance of the cross-validation score of the best pipeline.
+
+        Abstract property, must be implemented by the child classes.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial_pipeline(self) -> Pipeline:
+        """
+        Best pipeline with best hyperparameters.
+
+        Abstract property, must be implemented by the child classes.
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines(self, n: int) -> list:
+        """
+        Returns the n best pipelines together with
+        their best hyperparameters.
+
+        Must be implemented by the child classes.
+
+        :param int n: number of pipelines to return
+        """
+        return None
+
+    @abstractmethod
+    def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
+        """
+        If the hyperparameter search is done over multiple
+        pipelines, then returns n different pipeline-types
+        with corresponding hyperparameters
+
+        Must be implemented by the child classes.
+
+        NOTE(review): the parameter name n_int looks like a typo
+        of "n: int" (compare get_n_best_trial_pipelines) - confirm
+        against the child classes before renaming.
+        """
+        pass
+
+    @abstractmethod
+    def trials_to_excel(self, path: str) -> None:
+        """
+        Writes the trials object to an excel file as a table.
+
+        Must be implemented by the child classes.
+        The table should contain the iteration, pipeline (as str),
+        hyperparameters (as str), self.best_result
+        (see self._objective method) as well as the additional
+        information defined by self.default_summary.
+
+        :param str path: path of the excel file
+        """
+        return None

+ 0 - 0
cdplib/unit_tests/TestFlattenData.py


+ 0 - 0
cdplib/unit_tests/TestLog.py


+ 0 - 0
cdplib/unit_tests/TestMongodbHandler.py


+ 0 - 0
cdplib/unit_tests/invalid_test_schema.json


+ 0 - 0
cdplib/unit_tests/valid_test_schema.json


+ 21 - 10
cdplib/utils/CleaningUtils.py

@@ -8,13 +8,16 @@ Created on Fri Sep 27 16:20:03 2019
 
 import pandas as pd
 import numpy as np
+from typing import Union, List
 
 
 class CleaningUtils:
     '''
     Unites different methods for data cleaning
     '''
-    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
+    def convert_dates(self,
+                      series: pd.Series,
+                      formats: Union[str, List[str]]) -> pd.Series:
         '''
         Converts values from string to date in a pandas Series
          where possibly multiple date formats are mixed
@@ -29,8 +32,7 @@ class CleaningUtils:
 
                 series = series.astype(str)
 
-                series.loc[missing_leading_zero] = "0" +\
-                    series.loc[missing_leading_zero]
+                series.loc[missing_leading_zero] += "0"
 
             converted_this_format = pd.to_datetime(series,
                                                    format=formt,
@@ -71,21 +73,28 @@ class CleaningUtils:
 
         return s
 
-    def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
+    def melt_duplicated_columns(self, df: pd.DataFrame,
+                                suffix: str = "",
+                                prefix: str = "") -> pd.DataFrame:
         '''
         If a dataframe has multiple columns with the same name
          (up to a prefix or a suffix),
          melts the columns together in one
 
-        :parame suffix: string or regex up to which we consider names as duplicated
-        :parame prefix: string or regex up to which we consider names as duplicated
+        :parame suffix: string or regex up
+            to which we consider names as duplicated
+        :parame prefix: string or rege
+            up to which we consider names as duplicated
         '''
         from collections import Counter
 
         import re
 
-        # remove the suffix and the prefix from the column names (now the duplicates are truely duplicates)
-        df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
+        # remove the suffix and the prefix from the column names
+        # (now the duplicates are truely duplicates)
+        df.columns = [re.sub(re.compile(prefix), "",
+                             re.sub(re.compile(suffix), "", c))
+                      for c in df.columns]
 
         column_counter = Counter(df.columns)
 
@@ -100,10 +109,12 @@ class CleaningUtils:
             df_melted = []
 
             for dup_var in dup_vars:
-                dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
+                dup_var_melted = pd.melt(frame=df,
+                                         id_vars=id_vars,
+                                         value_vars=[dup_var],
+                                         value_name=dup_var)\
                                    .set_index(id_vars)[dup_var]
 
                 df_melted.append(dup_var_melted)
 
             return pd.concat(df_melted, axis=1, sort=False).reset_index()
-

+ 28 - 18
cdplib/utils/ExceptionsHandler.py

@@ -8,35 +8,45 @@ Created on Fri Sep 27 14:20:58 2019
 
 import os
 import sys
-import logging
 import pandas as pd
+from cdplib.log import Log
+
 sys.path.append(os.getcwd())
 
 
 class ExceptionsHandler:
     '''
     '''
-    def __init__(self):
+    def __init__(self, logger: Log = None):
         '''
         '''
+        self._logger = logger or Log("ExceptionHandler")
 
-    def check_is_file(self, path, logger=None):
+    def check_is_file(self, path: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
-
         if not os.path.isfile(path):
             err = "File {} not found".format(path)
-            logger.error(err)
+            self._logger.error(err)
             raise FileNotFoundError(err)
 
-    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
-                               error_or_warning: str, logger = None):
+    def assert_is_directory(self, path: str):
+        '''
+        Makes sure that the directory component of a path exists,
+        it is created together with the missing parents if needed.
+
+        :param str path: path whose directory component
+            (os.path.dirname) must exist. Nothing is created
+            when the path has no directory component.
+        :raises AssertionError: if path is not a string
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must of str type"
+
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            # makedirs (unlike mkdir) creates the parents
+            # and accepts an already existing directory
+            os.makedirs(dirname, exist_ok=True)
+
+    def _check_column_abscence(self,
+                               columns: (str, list),
+                               data: pd.DataFrame,
+                               error_or_warning: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
         if isinstance(columns, str):
             columns = [columns]
 
@@ -44,23 +54,23 @@ class ExceptionsHandler:
 
             if column not in data.columns:
                 err = ("{} is not an internal column name".format(column))
-                getattr(logger, error_or_warning)(err)
+                getattr(self._logger, error_or_warning)(err)
 
                 if error_or_warning == "error":
                     raise Exception(err)
 
-    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def error_column_abscence(self,
+                              columns: (str, list),
+                              data: pd.DataFrame):
         '''
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="error",
-                                           logger=logger)
+                                           error_or_warning="error")
 
-    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
         '''
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="warning",
-                                           logger=logger)
+                                           error_or_warning="warning")

+ 46 - 0
cdplib/utils/LoadingUtils.py

@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct  1 12:58:58 2020
+
+@author: tanya
+@description: class for methods of loading data from external sources
+"""
+
+import os
+import sys
+from cdplib.log import Log
+
+
+class LoadingUtils:
+    """
+    Methods for loading objects from external sources
+    """
+    def __init__(self, logger=None):
+        """
+        :param logger: optional logger, when not provided
+            a Log with the name "LoadingUtils" is created
+        """
+        self._logger = logger or Log("LoadingUtils")
+
+    def load_from_module(self, module_path: str, name: str):
+        """
+        Loads an object by its name from a python file.
+
+        The file is executed as a module directly from its path,
+        sys.path is not modified.
+
+        :param str module_path: path to the python (.py) file
+        :param str name: name of the object defined in the file
+
+        :return: the object called name from the module
+
+        :raises AssertionError: if a parameter is not a string or
+            module_path is not an existing python file
+        """
+        import importlib.util
+
+        # check the values of the arguments, not their names
+        for p_name, p_value in [("module_path", module_path),
+                                ("name", name)]:
+            assert(isinstance(p_value, str)),\
+                "Parameter '{}' must be of str type".format(p_name)
+
+        assert(os.path.isfile(module_path)),\
+            "Parameter 'module_path' must be a valid file"
+
+        module_name, extension = os.path.splitext(
+                os.path.basename(module_path))
+
+        assert(extension == ".py"),\
+            "Parameter 'space' must be read from a python file"
+
+        try:
+            spec = importlib.util.spec_from_file_location(
+                    module_name, module_path)
+
+            module = importlib.util.module_from_spec(spec)
+
+            spec.loader.exec_module(module)
+
+            # a missing name raises AttributeError here,
+            # the counterpart of the ImportError of "from m import n"
+            return getattr(module, name)
+
+        except (ImportError, AttributeError):
+            err = "Invalid space location or name"
+            self._logger.log_and_raise_error(err)

+ 36 - 0
cdplib/utils/TypeConverter.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 24 09:06:13 2020
+
+@author: tanya
+"""
+
+import numpy as np
+import pandas as pd
+
+class TypeConverter:
+    """
+    Collection of helper methods for converting between python types
+    """
+    def __init__(self):
+        """
+        Creates the logger used to report conversion errors
+        """
+        from cdplib.log import Log
+
+        self._logger = Log("TypeConverter")
+
+    def convert_to_ndarray(self, x: (pd.DataFrame, np.ndarray)) -> np.ndarray:
+        '''
+        Returns the data of x as a numpy array.
+
+        A numpy array is returned unchanged, for a pandas
+        DataFrame or Series its values are returned.
+        Any other type is reported through the logger.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        pandas_types = (pd.core.frame.DataFrame, pd.core.series.Series)
+
+        if isinstance(x, pandas_types):
+            return x.values
+
+        self._logger.log_and_raise_error_stack_info(
+                'The argument must be a numpy array or a pandas DataFrame')

+ 0 - 0
cdplib/utils/__init__.py


+ 0 - 0
classes.png


+ 0 - 0
hooks/README.txt


+ 0 - 0
hooks/pre-commit


+ 0 - 0
packages.png


+ 0 - 0
setup.py


+ 115 - 0
tests/testSQLOperations.py

@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 19 14:34:22 2018
+
+@author: tanya
+"""
+
+import os
+import unittest
+import pandas as pd
+
+from libraries.database_operations_library import SQLOperations
+
+
+class TestSQLOperations(unittest.TestCase):
+    '''
+    Base class of the SQLOperations tests: connects to the database
+    and holds the dataframe used as test data.
+    '''
+    def __init__(self, test_df = None):
+        '''
+        :param test_df: optional dataframe with the test data,
+            by default a small frame with the columns a, b and c
+        '''
+        # initializes the state used by the assert* methods
+        super().__init__()
+
+        print('\n', '='*5, 'Testing class : SQLOperations', '='*5)
+        self.inst = SQLOperations(db_url = None)
+        print('Connected to', self.inst.db_url)
+
+        if test_df is None:
+            self.test_df = pd.DataFrame({'a' : [1,2,3,4,5],
+                                         'b' : ['A', 'B', 'C', 'A', 'V'],
+                                         'c' : [0.1, 0.2, 0.3, 0.4, 0.5]})
+        else:
+            self.test_df = test_df
+
+    def _create_test_table(self, test_tablename, create_table_query = None):
+        '''
+        Drops the table if it exists and creates it again.
+
+        :param test_tablename: name of the table to create
+        :param create_table_query: optional query creating the table,
+            by default a table with the columns a, b and c is created
+        '''
+        self.inst.drop_table_if_exists(test_tablename)
+
+        if create_table_query is None:
+            # DB2 gets a CHAR column, the other databases a TEXT column
+            text_type = 'CHAR' if 'ibm_db' in self.inst.db_url else 'TEXT'
+
+            # the table name is taken from the argument in both cases
+            create_table_query = """CREATE TABLE {0} (
+                                    a INT,
+                                    b {1},
+                                    c DECIMAL(10 , 2 )
+                                    );""".format(test_tablename, text_type)
+
+        self.inst.execute(create_table_query)
+
+
+        
+class TestExecute(TestSQLOperations):
+    '''
+    Tests of the execute method of SQLOperations
+    '''
+    def __init__(self):
+        super().__init__()
+        print('\n', '-'*2, 'Testing method : execute')
+
+    def test_create_table(self, test_tablename, create_table_query = None):
+        '''
+        Creates the test table, checks that it exists
+        and drops it again.
+
+        :param test_tablename: name of the table to create
+        :param create_table_query: optional query creating the table
+        '''
+        print('-'*4, 'Testing create table operation')
+
+        self._create_test_table(test_tablename = test_tablename,
+                                create_table_query = create_table_query)
+
+        table_exists = self.inst.check_if_table_exists(test_tablename)
+        self.assertTrue(table_exists)
+
+        # clean up the table created by the test
+        self.inst.drop_table_if_exists(test_tablename)
+
+        print('-'*4, 'Test ran successfully!')
+        
+class TestLoad_csv_to_db(TestSQLOperations):
+    '''
+    Tests of the load_csv_to_db method of SQLOperations
+    '''
+    def __init__(self):
+        super(TestLoad_csv_to_db, self).__init__()
+        print('\n', '-'*2, 'Testing method : load_csv_to_db')
+
+    def test_correct_content(self, test_csv_path, test_tablename, create_table_query = None):
+        '''
+        Writes the test data to a csv file, loads the file to the
+        database and compares the table content with the test data.
+
+        :param test_csv_path: path where the csv file is written
+        :param test_tablename: name of the table the csv is loaded to
+        :param create_table_query: optional query creating the table
+        '''
+        print('-'*4, 'Testig that load operation gives correct result')
+        os.makedirs(os.path.dirname(test_csv_path), exist_ok = True)
+        if not self.inst.drop_table_if_exists(test_tablename):
+            self._create_test_table(test_tablename, create_table_query)
+
+        self.test_df.to_csv(test_csv_path, index = False)
+        self.inst.load_csv_to_db(csv_path = test_csv_path, tablename = test_tablename)
+        try:
+            connection = self.inst.engine.connect()
+            try:
+                # read from the table the csv was loaded to
+                test_df_from_sql = pd.read_sql(
+                        sql = "SELECT * FROM {}".format(test_tablename),
+                        con = connection)
+            finally:
+                # the connection is released also when the query fails
+                connection.close()
+        except Exception as e:
+            raise Exception('ERROR: test csv file has not been load to sql at all, \n, exit with {}'.format(e)) from e
+
+        print('-'*4, 'Testing data has correct shape')
+        self.assertTupleEqual(self.test_df.shape, test_df_from_sql.shape)
+
+        print('-'*4,'Testing data has correct columns')
+        self.assertSetEqual(set(self.test_df.columns), set(test_df_from_sql.columns))
+
+        print('-'*4,'Testing data has correct content')
+        # align the dtypes, so that only the values are compared
+        for col in self.test_df.columns:
+            test_df_from_sql[col] = test_df_from_sql[col].astype(self.test_df[col].dtype)
+        pd.testing.assert_frame_equal(self.test_df, test_df_from_sql)
+
+        print('-'*4, 'Test ran successfully!')
+
+                        
+if __name__ == '__main__':
+    
+    # Manual runner: the test methods take arguments,
+    # so they are called directly instead of through unittest.main()
+    test_tablename = 'test10'
+    # NOTE(review): absolute path inside a user's home directory,
+    # presumably has to be adapted on other machines - confirm
+    test_csv_path = '/home/tanya/acdp/data_samples/test.csv'
+    
+    TestExecute().test_create_table(test_tablename = test_tablename)
+    TestLoad_csv_to_db().test_correct_content(test_csv_path = test_csv_path, test_tablename = test_tablename)
+    
+    print('Done!', '\n')

+ 177 - 0
tests/testStatisticalFeatures.py

@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 18 16:26:47 2018
+
+@author: tanya
+"""
+
+import os
+import unittest
+import logging
+import pandas as pd
+import numpy as np
+
+from pandas.util.testing import assert_frame_equal
+
+from libraries.feature_engineering.in_memory_feature_engineering.StatisticalFeatures import StatisticalFeatures
+from libraries.logging.logging_utils import configure_logging
+
+
+class TestStatisticalFeatures(unittest.TestCase):
+    '''Base fixture for the StatisticalFeatures tests.
+
+    Stores the index column names, a data frame (a small sample with
+    int, float, str, datetime and all-NaN columns by default) and a
+    StatisticalFeatures instance built from them.
+
+    NOTE: the constructor takes data arguments instead of TestCase's
+    methodName, so instances are created by hand (see the __main__ block).
+    '''
+    def __init__(self, data = None, index_cols = None, path_to_log = None):
+        '''
+        :param DataFrame data: data the features are computed on;
+            defaults to a 7-row sample frame sorted by the index columns
+        :param list index_cols: names of the index columns;
+            defaults to ['id1', 'id2']
+        :param str path_to_log: forwarded to StatisticalFeatures
+        '''
+        # pd.datetime was deprecated in pandas 1.0 and later removed;
+        # the standard library class is the documented replacement.
+        from datetime import datetime
+
+        # Initialise the TestCase state that the assert* methods rely on.
+        super(TestStatisticalFeatures, self).__init__()
+
+        if index_cols is None:
+            self.index_cols = ['id1', 'id2']
+        else:
+            self.index_cols = index_cols
+
+        if data is None:
+            self.data = pd.DataFrame({'int' : [1,2,3,2,55,3,7],
+                                      'float' : [0.1, 7, 0.1, 99.9, 99.9, np.nan, 7],
+                                      'str' : ['a', np.nan, 'c', 'a', 'a', '', 'c'],
+                                      'datetime' : [datetime(2017, 1, 2), np.nan, datetime(2017, 5, 3), datetime(2017, 1, 4),
+                                                    '2018-01-19', datetime(2018, 1, 4), datetime(2019, 3, 23)],
+                                      'nan' : [np.nan]*7,
+                                      'id1' : [1,1,3,3,3,1,1],
+                                      'id2' : ['a', 'a', 'b', 'b', 'a', 'a', np.nan]})\
+                                      .sort_values(by = self.index_cols)
+        else:
+            self.data = data
+
+        self.obj = StatisticalFeatures(data = self.data, index_cols = self.index_cols, path_to_log = path_to_log)
+            
+class TestKpisByAggregation(TestStatisticalFeatures):
+    '''Tests for StatisticalFeatures.get_kpis_by_aggregation.
+
+    Every test accepts optional kpis and answer arguments; when they are
+    omitted, defaults matching the sample data of the base class are used.
+    '''
+    def __init__(self, data = None, index_cols = None, path_to_log = None):
+        '''
+        :param DataFrame data: forwarded to TestStatisticalFeatures
+        :param list index_cols: forwarded to TestStatisticalFeatures
+        :param str path_to_log: forwarded to TestStatisticalFeatures
+        '''
+        super(TestKpisByAggregation, self).__init__(data = data, index_cols = index_cols, path_to_log = path_to_log)
+
+    def _expected_frame(self, records):
+        '''Build an expected frame from records, sorted and indexed by the index columns.'''
+        return pd.DataFrame(records).sort_values(self.index_cols).set_index(self.index_cols)
+
+    def _assert_kpis_equal(self, kpis, answer):
+        '''Compute the kpis and compare them with answer on the computed columns.'''
+        result = self.obj.get_kpis_by_aggregation(kpis = kpis).sort_values(self.index_cols).set_index(self.index_cols)
+
+        assert_frame_equal(result, answer[result.columns])
+
+    def test_builtin_aggfuncs_numeric_cols(self, answer = None, kpis = None):
+        '''Tests the expected behaviour of pandas builtin aggregation function,
+           in particular behaviour with missing values
+
+           :param DataFrame answer: expected result, indexed by the index columns
+           :param list of tuples or dict kpis: columns and their aggregation functions
+        '''
+        kpis = kpis or [('int', ['min', 'std']),
+                        ('float', ['mean', np.sum]),
+                        ('float', 'sum'),
+                        ('nan', 'mean')]
+
+        # "answer or ..." would raise for a DataFrame (ambiguous truth value).
+        if answer is None:
+            answer = self._expected_frame([
+                {'id1' : 1, 'id2' : 'a', 'int_min' : 1, 'int_std' : pd.Series([1,2,3]).std(), 'float_mean' : np.mean([0.1, 7.0]), 'float_sum' : 7.1, 'nan_mean' : np.nan},
+                {'id1' : 3, 'id2' : 'b', 'int_min' : 2, 'int_std' : pd.Series([2,3]).std(), 'float_mean' : np.mean([0.1, 99.9]), 'float_sum' : 100, 'nan_mean' : np.nan},
+                {'id1' : 3, 'id2' : 'a', 'int_min' : 55, 'int_std' : np.nan, 'float_mean' : 99.9, 'float_sum' : 99.9, 'nan_mean' : np.nan},
+                ])
+
+        self._assert_kpis_equal(kpis, answer)
+
+    def test_dict_kpi(self, kpis = None, answer = None):
+        '''Tests kpis passed as a dict mapping a column to its aggregation function(s)
+
+           :param dict kpis: columns and their aggregation functions
+           :param DataFrame answer: expected result, indexed by the index columns
+        '''
+        kpis = kpis or {'int' : ['min', 'std'], 'float' : 'mean'}
+
+        if answer is None:
+            answer = self._expected_frame([
+                {'id1' : 1, 'id2' : 'a', 'int_min' : 1, 'int_std' : pd.Series([1,2,3]).std(), 'float_mean' : np.mean([0.1, 7.0])},
+                {'id1' : 3, 'id2' : 'b', 'int_min' : 2, 'int_std' : pd.Series([2,3]).std(), 'float_mean' : np.mean([0.1, 99.9])},
+                {'id1' : 3, 'id2' : 'a', 'int_min' : 55, 'int_std' : np.nan, 'float_mean' : 99.9},
+                ])
+
+        self._assert_kpis_equal(kpis, answer)
+
+    def test_string_cols(self, kpis = None, answer = None):
+        '''Tests aggregation of a string column
+
+           :param dict kpis: columns and their aggregation functions
+           :param DataFrame answer: expected result, indexed by the index columns
+        '''
+        kpis = kpis or {'str' : ['sum']}
+
+        if answer is None:
+            answer = self._expected_frame([
+                {'id1' : 1, 'id2' : 'a', 'str_sum' : 'anan'},
+                {'id1' : 3, 'id2' : 'b', 'str_sum' : 'ca'},
+                {'id1' : 3, 'id2' : 'a', 'str_sum' : 'a'},
+                ])
+
+        self._assert_kpis_equal(kpis, answer)
+
+    def test_custom_aggfunc(self, kpis = None, answer = None):
+        '''Tests aggregation with a user defined function
+
+           :param dict kpis: columns and their aggregation functions;
+               defaults to a custom sum over the 'int' column
+           :param DataFrame answer: expected result, indexed by the index columns
+        '''
+        if kpis is None:
+            def custom_sum(x):
+                return np.sum(x)
+
+            kpis = {'int' : custom_sum}
+
+        # In the default data group (3, 'a') holds the single value 55 and
+        # group (3, 'b') holds 3 and 2, as in test_some_wrong_col.
+        # NOTE(review): the column name 'int_custom_sum' assumes kpis are
+        # named <column>_<function name> - confirm against StatisticalFeatures.
+        if answer is None:
+            answer = self._expected_frame([
+                {'id1' : 1, 'id2' : 'a', 'int_custom_sum' : 6},
+                {'id1' : 3, 'id2' : 'a', 'int_custom_sum' : 55},
+                {'id1' : 3, 'id2' : 'b', 'int_custom_sum' : 5},
+                ])
+
+        self._assert_kpis_equal(kpis, answer)
+
+    def test_some_wrong_col(self, kpis = None, answer = None):
+        '''Tests kpis where one of the requested columns is not in the data
+
+           :param dict kpis: columns and their aggregation functions
+           :param DataFrame answer: expected result, indexed by the index columns
+        '''
+        kpis = kpis or {'bla' : 'sum', 'int' : 'sum'}
+
+        if answer is None:
+            answer = self._expected_frame([
+                {'id1' : 1, 'id2' : 'a', 'int_sum' : 6},
+                {'id1' : 3, 'id2' : 'a', 'int_sum' : 55},
+                {'id1' : 3, 'id2' : 'b', 'int_sum' : 5},
+                ])
+
+        self._assert_kpis_equal(kpis, answer)
+
+    def test_all_wrong_cols(self, kpis = None, answer = None):
+        '''Tests kpis where none of the requested columns is in the data
+
+           :param dict kpis: columns and their aggregation functions
+           :param DataFrame answer: expected result; defaults to the distinct
+               combinations of the index columns found in the data
+        '''
+        kpis = kpis or {'bla' : 'sum', 'blub' : 'sum'}
+
+        result = self.obj.get_kpis_by_aggregation(kpis = kpis)
+
+        # Use the caller's answer when given instead of discarding it.
+        if answer is None:
+            answer = self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+
+        assert_frame_equal(result, answer[result.columns])
+        
+if __name__ == '__main__':
+
+    # The log file lives in the tests tree below the PROJECT_DIR directory.
+    log_dir_parts = ('tests', 'test_feature_engineering', 'test_in_memory_feature_engineering')
+    path_to_log = os.path.join(os.environ.get('PROJECT_DIR'),
+                               *log_dir_parts,
+                               'test_kpis_by_aggregation.log')
+
+    configure_logging(path_to_log)
+    logger = logging.getLogger(__name__)
+
+    inst = TestKpisByAggregation(path_to_log = path_to_log)
+
+    # Run the selected tests one after another, in this fixed order.
+    for test_name in ('test_builtin_aggfuncs_numeric_cols',
+                      'test_dict_kpi',
+                      'test_string_cols',
+                      'test_some_wrong_col',
+                      'test_all_wrong_cols'):
+        getattr(inst, test_name)()
+
+    logger.info('Done testing method get_kpis_by_aggregation!')