#4 Added an abstract class PipelineSelector and two child classes GridSearchPipelineSelector and HyperoptPipelineSelector

Отворено
tanja жели да споји 36 комит(е) из tanja/new_pipeline_selection у tanja/master
52 измењених фајлова са 3788 додато и 1496 уклоњено
  1. 0 0
      .gitignore
  2. 0 0
      Pipfile
  3. 0 611
      Pipfile.lock
  4. 0 0
      README.md
  5. 0 0
      cdplib/DataExplorer/DataExplorer.py
  6. 0 0
      cdplib/FlattenData.py
  7. 0 0
      cdplib/Singleton_Threadsafe.py
  8. 0 0
      cdplib/__init__.py
  9. 20 20
      cdplib/db_handlers/InfluxdbHandler.py
  10. 0 0
      cdplib/db_handlers/MongodbHandler.py
  11. 1 2
      cdplib/db_handlers/SQLHandler.py
  12. 2 1
      cdplib/db_handlers/__init__.py
  13. 0 0
      cdplib/db_migration/DataFrameToCollection.py
  14. 27 25
      cdplib/db_migration/MigrationCleaning.py
  15. 0 0
      cdplib/db_migration/ParseDbSchema.py
  16. 0 0
      cdplib/db_migration/ParseJsonSchema.py
  17. 0 0
      cdplib/db_migration/ParseMapping.py
  18. 270 0
      cdplib/feature_engineering/StatisticalFeatures.py
  19. 77 0
      cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py
  20. 53 0
      cdplib/feature_engineering/StatisticalFeaturesOverTime.py
  21. 173 0
      cdplib/fine_tuning/FineTunedClassiferCV.py
  22. 375 0
      cdplib/gridsearch/GridSearchPipelineSelector.py
  23. 33 0
      cdplib/gridsearch/space_sample.py
  24. 0 798
      cdplib/hyperopt/HyperoptPipelineSelection.py
  25. 496 0
      cdplib/hyperopt/HyperoptPipelineSelector.py
  26. 0 0
      cdplib/hyperopt/__init__.py
  27. 116 0
      cdplib/hyperopt/composed_space_sample.py
  28. 40 0
      cdplib/hyperopt/space_sample.py
  29. 85 0
      cdplib/hyperparameter_space_composer/SpaceComposer.py
  30. 12 11
      cdplib/log.py
  31. 208 0
      cdplib/ml_validation/CVComposer.py
  32. 0 0
      cdplib/ml_validation/__init__.py
  33. 491 0
      cdplib/ml_validation/cross_validate_with_fine_tuning.py
  34. 97 0
      cdplib/ml_validation/expanding_cv.py
  35. 789 0
      cdplib/pipeline_selector/PipelineSelector.py
  36. 0 0
      cdplib/unit_tests/TestFlattenData.py
  37. 0 0
      cdplib/unit_tests/TestLog.py
  38. 0 0
      cdplib/unit_tests/TestMongodbHandler.py
  39. 0 0
      cdplib/unit_tests/invalid_test_schema.json
  40. 0 0
      cdplib/unit_tests/valid_test_schema.json
  41. 21 10
      cdplib/utils/CleaningUtils.py
  42. 28 18
      cdplib/utils/ExceptionsHandler.py
  43. 46 0
      cdplib/utils/LoadingUtils.py
  44. 36 0
      cdplib/utils/TypeConverter.py
  45. 0 0
      cdplib/utils/__init__.py
  46. 0 0
      classes.png
  47. 0 0
      hooks/README.txt
  48. 0 0
      hooks/pre-commit
  49. 0 0
      packages.png
  50. 0 0
      setup.py
  51. 115 0
      tests/testSQLOperations.py
  52. 177 0
      tests/testStatisticalFeatures.py


+ 0 - 611
Pipfile.lock

@@ -1,611 +0,0 @@
-{
-    "_meta": {
-        "hash": {
-            "sha256": "1879ebbd4ee3fe44d9e59091889a69ead4c7b76e81b70de0dd74d12b5266cf42"
-        },
-        "pipfile-spec": 6,
-        "requires": {
-            "python_version": "3"
-        },
-        "sources": [
-            {
-                "name": "pypi",
-                "url": "https://pypi.org/simple",
-                "verify_ssl": true
-            }
-        ]
-    },
-    "default": {
-        "boltons": {
-            "hashes": [
-                "sha256:3dd8a8e3c1886e7f7ba3422b50f55a66e1700161bf01b919d098e7d96dd2d9b6",
-                "sha256:dd362291a460cc1e0c2e91cc6a60da3036ced77099b623112e8f833e6734bdc5"
-            ],
-            "version": "==20.2.1"
-        },
-        "cdplib": {
-            "editable": true,
-            "git": "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git",
-            "ref": "623f7488557e373eb3181bb4099295ed17a53b5c"
-        },
-        "certifi": {
-            "hashes": [
-                "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c",
-                "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"
-            ],
-            "version": "==2020.12.5"
-        },
-        "chardet": {
-            "hashes": [
-                "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa",
-                "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==4.0.0"
-        },
-        "cloudpickle": {
-            "hashes": [
-                "sha256:3a32d0eb0bc6f4d0c57fbc4f3e3780f7a81e6fee0fa935072884d58ae8e1cc7c",
-                "sha256:9bc994f9e9447593bd0a45371f0e7ac7333710fcf64a4eb9834bf149f4ef2f32"
-            ],
-            "markers": "python_version >= '3.5'",
-            "version": "==1.6.0"
-        },
-        "decorator": {
-            "hashes": [
-                "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760",
-                "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"
-            ],
-            "version": "==4.4.2"
-        },
-        "future": {
-            "hashes": [
-                "sha256:b1bead90b70cf6ec3f0710ae53a525360fa360d306a86583adc6bf83a4db537d"
-            ],
-            "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==0.18.2"
-        },
-        "greenlet": {
-            "hashes": [
-                "sha256:0a77691f0080c9da8dfc81e23f4e3cffa5accf0f5b56478951016d7cfead9196",
-                "sha256:0ddd77586553e3daf439aa88b6642c5f252f7ef79a39271c25b1d4bf1b7cbb85",
-                "sha256:111cfd92d78f2af0bc7317452bd93a477128af6327332ebf3c2be7df99566683",
-                "sha256:122c63ba795fdba4fc19c744df6277d9cfd913ed53d1a286f12189a0265316dd",
-                "sha256:181300f826625b7fd1182205b830642926f52bd8cdb08b34574c9d5b2b1813f7",
-                "sha256:1a1ada42a1fd2607d232ae11a7b3195735edaa49ea787a6d9e6a53afaf6f3476",
-                "sha256:1bb80c71de788b36cefb0c3bb6bfab306ba75073dbde2829c858dc3ad70f867c",
-                "sha256:1d1d4473ecb1c1d31ce8fd8d91e4da1b1f64d425c1dc965edc4ed2a63cfa67b2",
-                "sha256:292e801fcb3a0b3a12d8c603c7cf340659ea27fd73c98683e75800d9fd8f704c",
-                "sha256:2c65320774a8cd5fdb6e117c13afa91c4707548282464a18cf80243cf976b3e6",
-                "sha256:4365eccd68e72564c776418c53ce3c5af402bc526fe0653722bc89efd85bf12d",
-                "sha256:5352c15c1d91d22902582e891f27728d8dac3bd5e0ee565b6a9f575355e6d92f",
-                "sha256:58ca0f078d1c135ecf1879d50711f925ee238fe773dfe44e206d7d126f5bc664",
-                "sha256:5d4030b04061fdf4cbc446008e238e44936d77a04b2b32f804688ad64197953c",
-                "sha256:5d69bbd9547d3bc49f8a545db7a0bd69f407badd2ff0f6e1a163680b5841d2b0",
-                "sha256:5f297cb343114b33a13755032ecf7109b07b9a0020e841d1c3cedff6602cc139",
-                "sha256:62afad6e5fd70f34d773ffcbb7c22657e1d46d7fd7c95a43361de979f0a45aef",
-                "sha256:647ba1df86d025f5a34043451d7c4a9f05f240bee06277a524daad11f997d1e7",
-                "sha256:719e169c79255816cdcf6dccd9ed2d089a72a9f6c42273aae12d55e8d35bdcf8",
-                "sha256:7cd5a237f241f2764324396e06298b5dee0df580cf06ef4ada0ff9bff851286c",
-                "sha256:875d4c60a6299f55df1c3bb870ebe6dcb7db28c165ab9ea6cdc5d5af36bb33ce",
-                "sha256:90b6a25841488cf2cb1c8623a53e6879573010a669455046df5f029d93db51b7",
-                "sha256:94620ed996a7632723a424bccb84b07e7b861ab7bb06a5aeb041c111dd723d36",
-                "sha256:b5f1b333015d53d4b381745f5de842f19fe59728b65f0fbb662dafbe2018c3a5",
-                "sha256:c5b22b31c947ad8b6964d4ed66776bcae986f73669ba50620162ba7c832a6b6a",
-                "sha256:c93d1a71c3fe222308939b2e516c07f35a849c5047f0197442a4d6fbcb4128ee",
-                "sha256:cdb90267650c1edb54459cdb51dab865f6c6594c3a47ebd441bc493360c7af70",
-                "sha256:cfd06e0f0cc8db2a854137bd79154b61ecd940dce96fad0cba23fe31de0b793c",
-                "sha256:d3789c1c394944084b5e57c192889985a9f23bd985f6d15728c745d380318128",
-                "sha256:da7d09ad0f24270b20f77d56934e196e982af0d0a2446120cb772be4e060e1a2",
-                "sha256:df3e83323268594fa9755480a442cabfe8d82b21aba815a71acf1bb6c1776218",
-                "sha256:df8053867c831b2643b2c489fe1d62049a98566b1646b194cc815f13e27b90df",
-                "sha256:e1128e022d8dce375362e063754e129750323b67454cac5600008aad9f54139e",
-                "sha256:e6e9fdaf6c90d02b95e6b0709aeb1aba5affbbb9ccaea5502f8638e4323206be",
-                "sha256:eac8803c9ad1817ce3d8d15d1bb82c2da3feda6bee1153eec5c58fa6e5d3f770",
-                "sha256:eb333b90036358a0e2c57373f72e7648d7207b76ef0bd00a4f7daad1f79f5203",
-                "sha256:ed1d1351f05e795a527abc04a0d82e9aecd3bdf9f46662c36ff47b0b00ecaf06",
-                "sha256:f3dc68272990849132d6698f7dc6df2ab62a88b0d36e54702a8fd16c0490e44f",
-                "sha256:f59eded163d9752fd49978e0bab7a1ff21b1b8d25c05f0995d140cc08ac83379",
-                "sha256:f5e2d36c86c7b03c94b8459c3bd2c9fe2c7dab4b258b8885617d44a22e453fb7",
-                "sha256:f6f65bf54215e4ebf6b01e4bb94c49180a589573df643735107056f7a910275b",
-                "sha256:f8450d5ef759dbe59f84f2c9f77491bb3d3c44bc1a573746daf086e70b14c243",
-                "sha256:f97d83049715fd9dec7911860ecf0e17b48d8725de01e45de07d8ac0bd5bc378"
-            ],
-            "markers": "python_version >= '3'",
-            "version": "==1.0.0"
-        },
-        "hyperopt": {
-            "hashes": [
-                "sha256:bc6047d50f956ae64eebcb34b1fd40f186a93e214957f20e87af2f10195295cc",
-                "sha256:dc5c7cceaf33c125b727cf92709e70035d94dd507831dae66406ac762a18a253"
-            ],
-            "index": "pypi",
-            "version": "==0.2.5"
-        },
-        "idna": {
-            "hashes": [
-                "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6",
-                "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.10"
-        },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:c9db46394197244adf2f0b08ec5bc3cf16757e9590b02af1fca085c16c0d600a",
-                "sha256:d2d46ef77ffc85cbf7dac7e81dd663fde71c45326131bea8033b9bad42268ebe"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.10.0"
-        },
-        "influxdb": {
-            "hashes": [
-                "sha256:46f85e7b04ee4b3dee894672be6a295c94709003a7ddea8820deec2ac4d8b27a",
-                "sha256:65040a1f53d1a2a4f88a677e89e3a98189a7d30cf2ab61c318aaa89733280747"
-            ],
-            "index": "pypi",
-            "version": "==5.3.1"
-        },
-        "joblib": {
-            "hashes": [
-                "sha256:9c17567692206d2f3fb9ecf5e991084254fe631665c450b443761c4186a613f7",
-                "sha256:feeb1ec69c4d45129954f1b7034954241eedfd6ba39b5e9e4b6883be3332d5e5"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==1.0.1"
-        },
-        "jsonref": {
-            "hashes": [
-                "sha256:b1e82fa0b62e2c2796a13e5401fe51790b248f6d9bf9d7212a3e31a3501b291f",
-                "sha256:f3c45b121cf6257eafabdc3a8008763aed1cd7da06dbabc59a9e4d2a5e4e6697"
-            ],
-            "index": "pypi",
-            "version": "==0.2"
-        },
-        "msgpack": {
-            "hashes": [
-                "sha256:0cb94ee48675a45d3b86e61d13c1e6f1696f0183f0715544976356ff86f741d9",
-                "sha256:1026dcc10537d27dd2d26c327e552f05ce148977e9d7b9f1718748281b38c841",
-                "sha256:26a1759f1a88df5f1d0b393eb582ec022326994e311ba9c5818adc5374736439",
-                "sha256:2a5866bdc88d77f6e1370f82f2371c9bc6fc92fe898fa2dec0c5d4f5435a2694",
-                "sha256:31c17bbf2ae5e29e48d794c693b7ca7a0c73bd4280976d408c53df421e838d2a",
-                "sha256:497d2c12426adcd27ab83144057a705efb6acc7e85957a51d43cdcf7f258900f",
-                "sha256:5a9ee2540c78659a1dd0b110f73773533ee3108d4e1219b5a15a8d635b7aca0e",
-                "sha256:8521e5be9e3b93d4d5e07cb80b7e32353264d143c1f072309e1863174c6aadb1",
-                "sha256:87869ba567fe371c4555d2e11e4948778ab6b59d6cc9d8460d543e4cfbbddd1c",
-                "sha256:8ffb24a3b7518e843cd83538cf859e026d24ec41ac5721c18ed0c55101f9775b",
-                "sha256:92be4b12de4806d3c36810b0fe2aeedd8d493db39e2eb90742b9c09299eb5759",
-                "sha256:9ea52fff0473f9f3000987f313310208c879493491ef3ccf66268eff8d5a0326",
-                "sha256:a4355d2193106c7aa77c98fc955252a737d8550320ecdb2e9ac701e15e2943bc",
-                "sha256:a99b144475230982aee16b3d249170f1cccebf27fb0a08e9f603b69637a62192",
-                "sha256:ac25f3e0513f6673e8b405c3a80500eb7be1cf8f57584be524c4fa78fe8e0c83",
-                "sha256:b28c0876cce1466d7c2195d7658cf50e4730667196e2f1355c4209444717ee06",
-                "sha256:b55f7db883530b74c857e50e149126b91bb75d35c08b28db12dcb0346f15e46e",
-                "sha256:b6d9e2dae081aa35c44af9c4298de4ee72991305503442a5c74656d82b581fe9",
-                "sha256:c747c0cc08bd6d72a586310bda6ea72eeb28e7505990f342552315b229a19b33",
-                "sha256:d6c64601af8f3893d17ec233237030e3110f11b8a962cb66720bf70c0141aa54",
-                "sha256:d8167b84af26654c1124857d71650404336f4eb5cc06900667a493fc619ddd9f",
-                "sha256:de6bd7990a2c2dabe926b7e62a92886ccbf809425c347ae7de277067f97c2887",
-                "sha256:e36a812ef4705a291cdb4a2fd352f013134f26c6ff63477f20235138d1d21009",
-                "sha256:e89ec55871ed5473a041c0495b7b4e6099f6263438e0bd04ccd8418f92d5d7f2",
-                "sha256:f3e6aaf217ac1c7ce1563cf52a2f4f5d5b1f64e8729d794165db71da57257f0c",
-                "sha256:f484cd2dca68502de3704f056fa9b318c94b1539ed17a4c784266df5d6978c87",
-                "sha256:fae04496f5bc150eefad4e9571d1a76c55d021325dcd484ce45065ebbdd00984",
-                "sha256:fe07bc6735d08e492a327f496b7850e98cb4d112c56df69b0c844dbebcbb47f6"
-            ],
-            "version": "==1.0.2"
-        },
-        "mysql": {
-            "hashes": [
-                "sha256:55e66b5e7b3823b1da5fb2a063e95a628fb850b2a0b76bdcd884faac5d2daa7d"
-            ],
-            "index": "pypi",
-            "version": "==0.0.2"
-        },
-        "mysqlclient": {
-            "hashes": [
-                "sha256:0ac0dd759c4ca02c35a9fedc24bc982cf75171651e8187c2495ec957a87dfff7",
-                "sha256:3381ca1a4f37ff1155fcfde20836b46416d66531add8843f6aa6d968982731c3",
-                "sha256:71c4b330cf2313bbda0307fc858cc9055e64493ba9bf28454d25cf8b3ee8d7f5",
-                "sha256:f6ebea7c008f155baeefe16c56cd3ee6239f7a5a9ae42396c2f1860f08a7c432",
-                "sha256:fc575093cf81b6605bed84653e48b277318b880dc9becf42dd47fa11ffd3e2b6"
-            ],
-            "markers": "python_version >= '3.5'",
-            "version": "==2.0.3"
-        },
-        "networkx": {
-            "hashes": [
-                "sha256:0635858ed7e989f4c574c2328380b452df892ae85084144c73d8cd819f0c4e06",
-                "sha256:109cd585cac41297f71103c3c42ac6ef7379f29788eb54cb751be5a663bb235a"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==2.5.1"
-        },
-        "numpy": {
-            "hashes": [
-                "sha256:2428b109306075d89d21135bdd6b785f132a1f5a3260c371cee1fae427e12727",
-                "sha256:377751954da04d4a6950191b20539066b4e19e3b559d4695399c5e8e3e683bf6",
-                "sha256:4703b9e937df83f5b6b7447ca5912b5f5f297aba45f91dbbbc63ff9278c7aa98",
-                "sha256:471c0571d0895c68da309dacee4e95a0811d0a9f9f532a48dc1bea5f3b7ad2b7",
-                "sha256:61d5b4cf73622e4d0c6b83408a16631b670fc045afd6540679aa35591a17fe6d",
-                "sha256:6c915ee7dba1071554e70a3664a839fbc033e1d6528199d4621eeaaa5487ccd2",
-                "sha256:6e51e417d9ae2e7848314994e6fc3832c9d426abce9328cf7571eefceb43e6c9",
-                "sha256:719656636c48be22c23641859ff2419b27b6bdf844b36a2447cb39caceb00935",
-                "sha256:780ae5284cb770ade51d4b4a7dce4faa554eb1d88a56d0e8b9f35fca9b0270ff",
-                "sha256:878922bf5ad7550aa044aa9301d417e2d3ae50f0f577de92051d739ac6096cee",
-                "sha256:924dc3f83de20437de95a73516f36e09918e9c9c18d5eac520062c49191025fb",
-                "sha256:97ce8b8ace7d3b9288d88177e66ee75480fb79b9cf745e91ecfe65d91a856042",
-                "sha256:9c0fab855ae790ca74b27e55240fe4f2a36a364a3f1ebcfd1fb5ac4088f1cec3",
-                "sha256:9cab23439eb1ebfed1aaec9cd42b7dc50fc96d5cd3147da348d9161f0501ada5",
-                "sha256:a8e6859913ec8eeef3dbe9aed3bf475347642d1cdd6217c30f28dee8903528e6",
-                "sha256:aa046527c04688af680217fffac61eec2350ef3f3d7320c07fd33f5c6e7b4d5f",
-                "sha256:abc81829c4039e7e4c30f7897938fa5d4916a09c2c7eb9b244b7a35ddc9656f4",
-                "sha256:bad70051de2c50b1a6259a6df1daaafe8c480ca98132da98976d8591c412e737",
-                "sha256:c73a7975d77f15f7f68dacfb2bca3d3f479f158313642e8ea9058eea06637931",
-                "sha256:d15007f857d6995db15195217afdbddfcd203dfaa0ba6878a2f580eaf810ecd6",
-                "sha256:d76061ae5cab49b83a8cf3feacefc2053fac672728802ac137dd8c4123397677",
-                "sha256:e8e4fbbb7e7634f263c5b0150a629342cc19b47c5eba8d1cd4363ab3455ab576",
-                "sha256:e9459f40244bb02b2f14f6af0cd0732791d72232bbb0dc4bab57ef88e75f6935",
-                "sha256:edb1f041a9146dcf02cd7df7187db46ab524b9af2515f392f337c7cbbf5b52cd"
-            ],
-            "markers": "python_version >= '3.7'",
-            "version": "==1.20.2"
-        },
-        "pandas": {
-            "hashes": [
-                "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b",
-                "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f",
-                "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648",
-                "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb",
-                "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98",
-                "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a",
-                "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11",
-                "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a",
-                "sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e",
-                "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae",
-                "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d",
-                "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9",
-                "sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b",
-                "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788",
-                "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca",
-                "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5",
-                "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a",
-                "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814",
-                "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d",
-                "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086",
-                "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a",
-                "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb",
-                "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782",
-                "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"
-            ],
-            "index": "pypi",
-            "version": "==1.1.5"
-        },
-        "pymongo": {
-            "hashes": [
-                "sha256:0384d76b409278ddb34ac19cdc4664511685959bf719adbdc051875ded4689aa",
-                "sha256:05e2bda928a3a6bc6ddff9e5a8579d41928b75d7417b18f9a67c82bb52150ac6",
-                "sha256:152e4ac3158b776135d8fce28d2ac06e682b885fcbe86690d66465f262ab244e",
-                "sha256:180511abfef70feb022360b35f4863dd68e08334197089201d5c52208de9ca2e",
-                "sha256:19d52c60dc37520385f538d6d1a4c40bc398e0885f4ed6a36ce10b631dab2852",
-                "sha256:1d559a76ae87143ad96c2ecd6fdd38e691721e175df7ced3fcdc681b4638bca1",
-                "sha256:210ec4a058480b9c3869082e52b66d80c4a48eda9682d7a569a1a5a48100ea54",
-                "sha256:2163d736d6f62b20753be5da3dc07a188420b355f057fcbb3075b05ee6227b2f",
-                "sha256:22ee2c94fee1e391735be63aa1c9af4c69fdcb325ae9e5e4ddff770248ef60a6",
-                "sha256:28633868be21a187702a8613913e13d1987d831529358c29fc6f6670413df040",
-                "sha256:29390c39ca873737689a0749c9c3257aad96b323439b11279fbc0ba8626ec9c5",
-                "sha256:2aeb108da1ed8e066800fb447ba5ae89d560e6773d228398a87825ac3630452d",
-                "sha256:322f6cc7bf23a264151ebc5229a92600c4b55ac83c83c91c9bab1ec92c888a8d",
-                "sha256:34c15f5798f23488e509eae82fbf749c3d17db74379a88c07c869ece1aa806b9",
-                "sha256:3873866534b6527e6863e742eb23ea2a539e3c7ee00ad3f9bec9da27dbaaff6f",
-                "sha256:3dbc67754882d740f17809342892f0b24398770bd99d48c5cb5ba89f5f5dee4e",
-                "sha256:413b18ac2222f5d961eb8d1c8dcca6c6ca176c8613636d8c13aa23abae7f7a21",
-                "sha256:42f9ec9d77358f557fe17cc15e796c4d4d492ede1a30cba3664822cae66e97c5",
-                "sha256:4ac387ac1be71b798d1c372a924f9c30352f30e684e06f086091297352698ac0",
-                "sha256:4ca92e15fcf02e02e7c24b448a16599b98c9d0e6a46cd85cc50804450ebf7245",
-                "sha256:4d959e929cec805c2bf391418b1121590b4e7d5cb00af7b1ba521443d45a0918",
-                "sha256:5091aacbdb667b418b751157f48f6daa17142c4f9063d58e5a64c90b2afbdf9a",
-                "sha256:5a03ae5ac85b04b2034a0689add9ff597b16d5e24066a87f6ab0e9fa67049156",
-                "sha256:5e1341276ce8b7752db9aeac6bbb0cbe82a3f6a6186866bf6b4906d8d328d50b",
-                "sha256:6043d251fac27ca04ff22ed8deb5ff7a43dc18e8a4a15b4c442d2a20fa313162",
-                "sha256:610d5cbbfd026e2f6d15665af51e048e49b68363fedece2ed318cc8fe080dd94",
-                "sha256:622a5157ffcd793d305387c1c9fb94185f496c8c9fd66dafb59de0807bc14ad7",
-                "sha256:65b67637f0a25ac9d25efb13c1578eb065870220ffa82f132c5b2d8e43ac39c3",
-                "sha256:66573c8c7808cce4f3b56c23cb7cad6c3d7f4c464b9016d35f5344ad743896d7",
-                "sha256:66b688fc139c6742057795510e3b12c4acbf90d11af1eff9689a41d9c84478d6",
-                "sha256:685b884fa41bd2913fd20af85866c4ff886b7cbb7e4833b918996aa5d45a04be",
-                "sha256:6a5834e392c97f19f36670e34bf9d346d733ad89ee0689a6419dd737dfa4308a",
-                "sha256:728313cc0d59d1a1a004f675607dcf5c711ced3f55e75d82b3f264fd758869f3",
-                "sha256:733e1cfffc4cd99848230e2999c8a86e284c6af6746482f8ad2ad554dce14e39",
-                "sha256:7814b2cf23aad23464859973c5cd2066ca2fd99e0b934acefbb0b728ac2525bf",
-                "sha256:7c77801620e5e75fb9c7abae235d3cc45d212a67efa98f4972eef63e736a8daa",
-                "sha256:7cd42c66d49ffb68dea065e1c8a4323e7ceab386e660fee9863d4fa227302ba9",
-                "sha256:7d2ae2f7c50adec20fde46a73465de31a6a6fbb4903240f8b7304549752ca7a1",
-                "sha256:7edff02e44dd0badd749d7342e40705a398d98c5d8f7570f57cff9568c2351fa",
-                "sha256:87981008d565f647142869d99915cc4760b7725858da3d39ecb2a606e23f36fd",
-                "sha256:92e2376ce3ca0e3e443b3c5c2bb5d584c7e59221edfb0035313c6306049ba55a",
-                "sha256:950710f7370613a6bfa2ccd842b488c5b8072e83fb6b7d45d99110bf44651d06",
-                "sha256:980527f4ccc6644855bb68056fe7835da6d06d37776a52df5bcc1882df57c3db",
-                "sha256:9fbffc5bad4df99a509783cbd449ed0d24fcd5a450c28e7756c8f20eda3d2aa5",
-                "sha256:a8b02e0119d6ee381a265d8d2450a38096f82916d895fed2dfd81d4c7a54d6e4",
-                "sha256:b17e627844d86031c77147c40bf992a6e1114025a460874deeda6500d0f34862",
-                "sha256:b1aa62903a2c5768b0001632efdea2e8da6c80abdd520c2e8a16001cc9affb23",
-                "sha256:b32e4eed2ef19a20dfb57698497a9bc54e74efb2e260c003e9056c145f130dc7",
-                "sha256:b44fa04720bbfd617b6aef036989c8c30435f11450c0a59136291d7b41ed647f",
-                "sha256:b4535d98df83abebb572035754fb3d4ad09ce7449375fa09fa9ede2dbc87b62b",
-                "sha256:bb6a5777bf558f444cd4883d617546182cfeff8f2d4acd885253f11a16740534",
-                "sha256:bc2eb67387b8376120a2be6cba9d23f9d6a6c3828e00fb0a64c55ad7b54116d1",
-                "sha256:bd351ceb2decd23d523fc50bad631ee9ae6e97e7cdc355ce5600fe310484f96e",
-                "sha256:bf70097bd497089f1baabf9cbb3ec4f69c022dc7a70c41ba9c238fa4d0fff7ab",
-                "sha256:c7fd18d4b7939408df9315fedbdb05e179760960a92b3752498e2fcd03f24c3d",
-                "sha256:cc359e408712faf9ea775f4c0ec8f2bfc843afe47747a657808d9595edd34d71",
-                "sha256:cd8fc35d4c0c717cc29b0cb894871555cb7137a081e179877ecc537e2607f0b9",
-                "sha256:daa44cefde19978af57ac1d50413cd86ebf2b497328e7a27832f5824bda47439",
-                "sha256:db5098587f58fbf8582d9bda2462762b367207246d3e19623782fb449c3c5fcc",
-                "sha256:db6fd53ef5f1914ad801830406440c3bfb701e38a607eda47c38adba267ba300",
-                "sha256:e1414599a97554d451e441afb362dbee1505e4550852c0068370d843757a3fe2",
-                "sha256:ee42a8f850143ae7c67ea09a183a6a4ad8d053e1dbd9a1134e21a7b5c1bc6c73",
-                "sha256:f23abcf6eca5859a2982beadfb5111f8c5e76e30ff99aaee3c1c327f814f9f10",
-                "sha256:f6748c447feeadda059719ef5ab1fb9d84bd370e205b20049a0e8b45ef4ad593"
-            ],
-            "index": "pypi",
-            "version": "==3.11.3"
-        },
-        "pymysql": {
-            "hashes": [
-                "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641",
-                "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"
-            ],
-            "index": "pypi",
-            "version": "==1.0.2"
-        },
-        "python-dateutil": {
-            "hashes": [
-                "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
-                "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==2.8.1"
-        },
-        "pytz": {
-            "hashes": [
-                "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da",
-                "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"
-            ],
-            "version": "==2021.1"
-        },
-        "requests": {
-            "hashes": [
-                "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
-                "sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
-            "version": "==2.25.1"
-        },
-        "scikit-learn": {
-            "hashes": [
-                "sha256:0567a2d29ad08af98653300c623bd8477b448fe66ced7198bef4ed195925f082",
-                "sha256:087dfede39efb06ab30618f9ab55a0397f29c38d63cd0ab88d12b500b7d65fd7",
-                "sha256:1adf483e91007a87171d7ce58c34b058eb5dab01b5fee6052f15841778a8ecd8",
-                "sha256:259ec35201e82e2db1ae2496f229e63f46d7f1695ae68eef9350b00dc74ba52f",
-                "sha256:3c4f07f47c04e81b134424d53c3f5e16dfd7f494e44fd7584ba9ce9de2c5e6c1",
-                "sha256:4562dcf4793e61c5d0f89836d07bc37521c3a1889da8f651e2c326463c4bd697",
-                "sha256:4ddd2b6f7449a5d539ff754fa92d75da22de261fd8fdcfb3596799fadf255101",
-                "sha256:54be0a60a5a35005ad69c75902e0f5c9f699db4547ead427e97ef881c3242e6f",
-                "sha256:5580eba7345a4d3b097be2f067cc71a306c44bab19e8717a30361f279c929bea",
-                "sha256:7b04691eb2f41d2c68dbda8d1bd3cb4ef421bdc43aaa56aeb6c762224552dfb6",
-                "sha256:826b92bf45b8ad80444814e5f4ac032156dd481e48d7da33d611f8fe96d5f08b",
-                "sha256:83b21ff053b1ff1c018a2d24db6dd3ea339b1acfbaa4d9c881731f43748d8b3b",
-                "sha256:8772b99d683be8f67fcc04789032f1b949022a0e6880ee7b75a7ec97dbbb5d0b",
-                "sha256:895dbf2030aa7337649e36a83a007df3c9811396b4e2fa672a851160f36ce90c",
-                "sha256:8aa1b3ac46b80eaa552b637eeadbbce3be5931e4b5002b964698e33a1b589e1e",
-                "sha256:9599a3f3bf33f73fed0fe06d1dfa4e6081365a58c1c807acb07271be0dce9733",
-                "sha256:99349d77f54e11f962d608d94dfda08f0c9e5720d97132233ebdf35be2858b2d",
-                "sha256:9a24d1ccec2a34d4cd3f2a1f86409f3f5954cc23d4d2270ba0d03cf018aa4780",
-                "sha256:9bed8a1ef133c8e2f13966a542cb8125eac7f4b67dcd234197c827ba9c7dd3e0",
-                "sha256:9c6097b6a9b2bafc5e0f31f659e6ab5e131383209c30c9e978c5b8abdac5ed2a",
-                "sha256:9dfa564ef27e8e674aa1cc74378416d580ac4ede1136c13dd555a87996e13422",
-                "sha256:a0334a1802e64d656022c3bfab56a73fbd6bf4b1298343f3688af2151810bbdf",
-                "sha256:a29460499c1e62b7a830bb57ca42e615375a6ab1bcad053cd25b493588348ea8",
-                "sha256:a36e159a0521e13bbe15ca8c8d038b3a1dd4c7dad18d276d76992e03b92cf643",
-                "sha256:abe835a851610f87201819cb315f8d554e1a3e8128912783a31e87264ba5ffb7",
-                "sha256:c13ebac42236b1c46397162471ea1c46af68413000e28b9309f8c05722c65a09",
-                "sha256:c3deb3b19dd9806acf00cf0d400e84562c227723013c33abefbbc3cf906596e9",
-                "sha256:c658432d8a20e95398f6bb95ff9731ce9dfa343fdf21eea7ec6a7edfacd4b4d9",
-                "sha256:c7f4eb77504ac586d8ac1bde1b0c04b504487210f95297235311a0ab7edd7e38",
-                "sha256:d54dbaadeb1425b7d6a66bf44bee2bb2b899fe3e8850b8e94cfb9c904dcb46d0",
-                "sha256:ddb52d088889f5596bc4d1de981f2eca106b58243b6679e4782f3ba5096fd645",
-                "sha256:ed9d65594948678827f4ff0e7ae23344e2f2b4cabbca057ccaed3118fdc392ca",
-                "sha256:fab31f48282ebf54dd69f6663cd2d9800096bad1bb67bbc9c9ac84eb77b41972"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==0.24.1"
-        },
-        "scipy": {
-            "hashes": [
-                "sha256:03f1fd3574d544456325dae502facdf5c9f81cbfe12808a5e67a737613b7ba8c",
-                "sha256:0c81ea1a95b4c9e0a8424cf9484b7b8fa7ef57169d7bcc0dfcfc23e3d7c81a12",
-                "sha256:1fba8a214c89b995e3721670e66f7053da82e7e5d0fe6b31d8e4b19922a9315e",
-                "sha256:37f4c2fb904c0ba54163e03993ce3544c9c5cde104bcf90614f17d85bdfbb431",
-                "sha256:50e5bcd9d45262725e652611bb104ac0919fd25ecb78c22f5282afabd0b2e189",
-                "sha256:6ca1058cb5bd45388041a7c3c11c4b2bd58867ac9db71db912501df77be2c4a4",
-                "sha256:77f7a057724545b7e097bfdca5c6006bed8580768cd6621bb1330aedf49afba5",
-                "sha256:816951e73d253a41fa2fd5f956f8e8d9ac94148a9a2039e7db56994520582bf2",
-                "sha256:96620240b393d155097618bcd6935d7578e85959e55e3105490bbbf2f594c7ad",
-                "sha256:993c86513272bc84c451349b10ee4376652ab21f312b0554fdee831d593b6c02",
-                "sha256:adf7cee8e5c92b05f2252af498f77c7214a2296d009fc5478fc432c2f8fb953b",
-                "sha256:bc52d4d70863141bb7e2f8fd4d98e41d77375606cde50af65f1243ce2d7853e8",
-                "sha256:c1d3f771c19af00e1a36f749bd0a0690cc64632783383bc68f77587358feb5a4",
-                "sha256:d744657c27c128e357de2f0fd532c09c84cd6e4933e8232895a872e67059ac37",
-                "sha256:e3e9742bad925c421d39e699daa8d396c57535582cba90017d17f926b61c1552",
-                "sha256:e547f84cd52343ac2d56df0ab08d3e9cc202338e7d09fafe286d6c069ddacb31",
-                "sha256:e89091e6a8e211269e23f049473b2fde0c0e5ae0dd5bd276c3fc91b97da83480",
-                "sha256:e9da33e21c9bc1b92c20b5328adb13e5f193b924c9b969cd700c8908f315aa59",
-                "sha256:ffdfb09315896c6e9ac739bb6e13a19255b698c24e6b28314426fd40a1180822"
-            ],
-            "markers": "python_version < '3.10' and python_version >= '3.7'",
-            "version": "==1.6.2"
-        },
-        "simplejson": {
-            "hashes": [
-                "sha256:034550078a11664d77bc1a8364c90bb7eef0e44c2dbb1fd0a4d92e3997088667",
-                "sha256:05b43d568300c1cd43f95ff4bfcff984bc658aa001be91efb3bb21df9d6288d3",
-                "sha256:0dd9d9c738cb008bfc0862c9b8fa6743495c03a0ed543884bf92fb7d30f8d043",
-                "sha256:10fc250c3edea4abc15d930d77274ddb8df4803453dde7ad50c2f5565a18a4bb",
-                "sha256:2862beabfb9097a745a961426fe7daf66e1714151da8bb9a0c430dde3d59c7c0",
-                "sha256:292c2e3f53be314cc59853bd20a35bf1f965f3bc121e007ab6fd526ed412a85d",
-                "sha256:2d3eab2c3fe52007d703a26f71cf649a8c771fcdd949a3ae73041ba6797cfcf8",
-                "sha256:2e7b57c2c146f8e4dadf84977a83f7ee50da17c8861fd7faf694d55e3274784f",
-                "sha256:311f5dc2af07361725033b13cc3d0351de3da8bede3397d45650784c3f21fbcf",
-                "sha256:344e2d920a7f27b4023c087ab539877a1e39ce8e3e90b867e0bfa97829824748",
-                "sha256:3fabde09af43e0cbdee407555383063f8b45bfb52c361bc5da83fcffdb4fd278",
-                "sha256:42b8b8dd0799f78e067e2aaae97e60d58a8f63582939af60abce4c48631a0aa4",
-                "sha256:4b3442249d5e3893b90cb9f72c7d6ce4d2ea144d2c0d9f75b9ae1e5460f3121a",
-                "sha256:55d65f9cc1b733d85ef95ab11f559cce55c7649a2160da2ac7a078534da676c8",
-                "sha256:5c659a0efc80aaaba57fcd878855c8534ecb655a28ac8508885c50648e6e659d",
-                "sha256:72d8a3ffca19a901002d6b068cf746be85747571c6a7ba12cbcf427bfb4ed971",
-                "sha256:75ecc79f26d99222a084fbdd1ce5aad3ac3a8bd535cd9059528452da38b68841",
-                "sha256:76ac9605bf2f6d9b56abf6f9da9047a8782574ad3531c82eae774947ae99cc3f",
-                "sha256:7d276f69bfc8c7ba6c717ba8deaf28f9d3c8450ff0aa8713f5a3280e232be16b",
-                "sha256:7f10f8ba9c1b1430addc7dd385fc322e221559d3ae49b812aebf57470ce8de45",
-                "sha256:8042040af86a494a23c189b5aa0ea9433769cc029707833f261a79c98e3375f9",
-                "sha256:813846738277729d7db71b82176204abc7fdae2f566e2d9fcf874f9b6472e3e6",
-                "sha256:845a14f6deb124a3bcb98a62def067a67462a000e0508f256f9c18eff5847efc",
-                "sha256:869a183c8e44bc03be1b2bbcc9ec4338e37fa8557fc506bf6115887c1d3bb956",
-                "sha256:8acf76443cfb5c949b6e781c154278c059b09ac717d2757a830c869ba000cf8d",
-                "sha256:8f713ea65958ef40049b6c45c40c206ab363db9591ff5a49d89b448933fa5746",
-                "sha256:934115642c8ba9659b402c8bdbdedb48651fb94b576e3b3efd1ccb079609b04a",
-                "sha256:9551f23e09300a9a528f7af20e35c9f79686d46d646152a0c8fc41d2d074d9b0",
-                "sha256:9a2b7543559f8a1c9ed72724b549d8cc3515da7daf3e79813a15bdc4a769de25",
-                "sha256:a55c76254d7cf8d4494bc508e7abb993a82a192d0db4552421e5139235604625",
-                "sha256:ad8f41c2357b73bc9e8606d2fa226233bf4d55d85a8982ecdfd55823a6959995",
-                "sha256:af4868da7dd53296cd7630687161d53a7ebe2e63814234631445697bd7c29f46",
-                "sha256:afebfc3dd3520d37056f641969ce320b071bc7a0800639c71877b90d053e087f",
-                "sha256:b59aa298137ca74a744c1e6e22cfc0bf9dca3a2f41f51bc92eb05695155d905a",
-                "sha256:bc00d1210567a4cdd215ac6e17dc00cb9893ee521cee701adfd0fa43f7c73139",
-                "sha256:c1cb29b1fced01f97e6d5631c3edc2dadb424d1f4421dad079cb13fc97acb42f",
-                "sha256:c94dc64b1a389a416fc4218cd4799aa3756f25940cae33530a4f7f2f54f166da",
-                "sha256:ceaa28a5bce8a46a130cd223e895080e258a88d51bf6e8de2fc54a6ef7e38c34",
-                "sha256:cff6453e25204d3369c47b97dd34783ca820611bd334779d22192da23784194b",
-                "sha256:d0b64409df09edb4c365d95004775c988259efe9be39697d7315c42b7a5e7e94",
-                "sha256:d4813b30cb62d3b63ccc60dd12f2121780c7a3068db692daeb90f989877aaf04",
-                "sha256:da3c55cdc66cfc3fffb607db49a42448785ea2732f055ac1549b69dcb392663b",
-                "sha256:e058c7656c44fb494a11443191e381355388443d543f6fc1a245d5d238544396",
-                "sha256:fed0f22bf1313ff79c7fc318f7199d6c2f96d4de3234b2f12a1eab350e597c06",
-                "sha256:ffd4e4877a78c84d693e491b223385e0271278f5f4e1476a4962dca6824ecfeb"
-            ],
-            "index": "pypi",
-            "version": "==3.17.2"
-        },
-        "six": {
-            "hashes": [
-                "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
-                "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.15.0"
-        },
-        "sklearn": {
-            "hashes": [
-                "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
-            ],
-            "version": "==0.0"
-        },
-        "sqlalchemy": {
-            "hashes": [
-                "sha256:013b659efe02f0f58e7f759602584899c921c178c6a972978f16460dcdd782d5",
-                "sha256:193c3ca465fbc68de071995a461ab535466f041089d372ee6a6f0aae7b9307e6",
-                "sha256:2071ee6cd9390a9527a80ef03458fb58e0166bb299db2c62f9d688b6772d76a1",
-                "sha256:21becd8b45ec70b703239cf915104e47889c2aad96d0f68f597b9b547cbfd787",
-                "sha256:2713b338d9c54d2c3c7ff4f7786a40a5ca85013c8ccea00327b034d42598e22e",
-                "sha256:2a042c27b1a32a87f4cead53bcdd28999324992650896094368a595165b31d97",
-                "sha256:2e65c1146f5b4151cc6e553d9847299c97f53640d94ba88b1c534e15cdc6ac38",
-                "sha256:345c201324066b789804411f07eea750e9f29872be052eba221ce76add647d50",
-                "sha256:360a771b538463053383fb6ff7aceffb595248d7059bb9e003bf70562a66510d",
-                "sha256:432e98e6fe0d24e8181eb4177e59cba9f8831dcaf272a0d2de75bc8b933952a0",
-                "sha256:4387ebd5ae8bc2c716dbfc1ece769c867307eeecc192e72a4d2e7fa0fc092646",
-                "sha256:43fef20dd1024409375cc646a4b5afaffb62f6488e41588cde2a1ed2e9432b5b",
-                "sha256:4d71ee83441826fb48771e58cef51191500a87734b4acb6b698ca018479395bd",
-                "sha256:4eeff8b12c7d22be4de98721bba5a042875f4365e9fd20dc3916eec474ccb81e",
-                "sha256:534c71caa87c7fdb136ce5073fb42b732a4eb390946f503d8e1d7ce6a4a79100",
-                "sha256:66467123c220689d55c6d51fdf88f7b0b62b8078823c5f6c0297ab47c22003d7",
-                "sha256:6c4af3aceeff6a0e2bd3657d8b25714a9f7c7c606e7ec52029284973094f84c1",
-                "sha256:7d252dea33c1ee07b3d702fb4962963996ea40e5a2615dbe7646ccabd851ac76",
-                "sha256:86a7321636f851c6e8009901c5d67e97d82b86ee8c6f28a476691c41c3d71a95",
-                "sha256:88d75ea6b4330a6f5596a49904f21762ff89ca763db065d63b815ad8c3d68952",
-                "sha256:8a296bbf367867aee2ea8d5b391cb04fbdb3ca7277cd1649d9e8114620f3b090",
-                "sha256:933427a5474e014d01bac93224cd4e2bc7bbc7ce531d0bd7e55e4f940cc8ce0d",
-                "sha256:93f6fe67a76d7fa1cca3b9febb36e9f2dd76055230e2bfa317969532f34c03ab",
-                "sha256:a687e552ab4ffedcf3ec3bd5256ab3e753b4f605b467e9fa39690b2dadb5f607",
-                "sha256:a69787f7fc87b84df7e2f27158476cdf39a79ebb95af1d6f696e474724af9ebe",
-                "sha256:a76c10b467f7d385e4cffe2185d975336acf0dbf24ed702c46207df0fb64055e",
-                "sha256:b093bd6efb49332021714bed5752e784a34ae6d6896ec56ffdc32cc83275a215",
-                "sha256:bdeb300bb9adc02f98957cd0cf0c38d641bdd435b0927e39870a772e0a750bc0",
-                "sha256:c719f0058951457a7761bb69c2e47781a9989ab4819b7a30b6b39141ad013a5f",
-                "sha256:cadb58aeadd9916e79e8f99a49d0c0a9e61ae2b24469c2b304a0699e41a25e59",
-                "sha256:cc3c0d87b11ae1dd1ccbd6fc7875a290b3f73b771254180c2e7b19c2aec7379b",
-                "sha256:d42b8e2bffdf9e01d66cf46472b938493b854ea790a0fbe2e2e42624fc253b33",
-                "sha256:d7684e0598acfbfb5110bea482d8c5e94f52001d6d66b5558177f41f49fb5930",
-                "sha256:e5267cd2e51ddefbe10bb182c36ba41cdaa51c83a0fdfa63ed8cbe89cbcf0f33"
-            ],
-            "index": "pypi",
-            "version": "==1.4.6"
-        },
-        "sqlalchemy-utils": {
-            "hashes": [
-                "sha256:fb66e9956e41340011b70b80f898fde6064ec1817af77199ee21ace71d7d6ab0"
-            ],
-            "version": "==0.36.8"
-        },
-        "sqlparse": {
-            "hashes": [
-                "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
-                "sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
-            ],
-            "index": "pypi",
-            "version": "==0.4.1"
-        },
-        "threadpoolctl": {
-            "hashes": [
-                "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725",
-                "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b"
-            ],
-            "markers": "python_version >= '3.5'",
-            "version": "==2.1.0"
-        },
-        "tqdm": {
-            "hashes": [
-                "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3",
-                "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==4.60.0"
-        },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
-                "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
-                "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.7.4.3"
-        },
-        "urllib3": {
-            "hashes": [
-                "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df",
-                "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"
-            ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
-            "version": "==1.26.4"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
-                "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.4.1"
-        }
-    },
-    "develop": {}
-}


+ 0 - 0
cdplib/DataExplorer/DataExplorer.py


+ 0 - 0
cdplib/FlattenData.py


+ 0 - 0
cdplib/Singleton_Threadsafe.py


+ 0 - 0
cdplib/__init__.py


+ 20 - 20
cdplib/db_handlers/InfluxdbHandler.py

@@ -82,15 +82,15 @@ class InfluxdbHandler:
         try:
             # result of the query is a defaultdict
             result = self.client.query(query)
-            
+
             if len(list(result.values())) > 0:
 
                 return list(result.values())[0]
-            
+
             else:
-                
+
                 return pd.DataFrame()
-            
+
         except Exception as e:
             self._logger.log_and_raise_error(
                 ("Could not query to dataframe. "
@@ -118,30 +118,30 @@ class InfluxdbHandler:
 
         if (stop is not None) and (not isinstance(stop, str)):
             stop = datetime.strftime(stop, format="%Y-%m-%dT%H:%M:%SZ")
-            
+
         query = 'SELECT ' + columns + ' FROM \"' + tables
-            
+
         if (start is not None) and (stop is not None):
-            
+
              query += '\" WHERE time > \'' +\
                 str(start) +\
                 '\' AND time  < \'' +\
                 str(stop) +\
                 '\' tz(\'Europe/Berlin\');'
-                
+
         elif start is not None:
-            
+
             query += '\" WHERE time >= \'' + str(start) +\
                 '\' tz(\'Europe/Berlin\');'
-            
+
         elif stop is not None:
-            
+
             query += '\" WHERE time <= \'' + str(stop) +\
                 '\' tz(\'Europe/Berlin\');'
-                
+
         else:
             query += ';'
-            
+
 
         return self.query_to_dataframe(query)
 
@@ -150,13 +150,13 @@ class InfluxdbHandler:
                          batch_size: int = 10000,
                          time_precision: str = 'u'):
         """
-        Writes each column of the dataframe which is not 
+        Writes each column of the dataframe which is not
         in tag_columns as a separate measurement to the database.
-        
+
         Tag columns are put as tags to each measurement.
-        
+
         The dataframe has to have a datatime index!
-        
+
         :param dataframe: dataframe to write to the database
         :type dataframe: pd.DataFrame
         :param tag_columns: column names to be used as tags
@@ -166,10 +166,10 @@ class InfluxdbHandler:
         :param time_precision:
         :type tiime_precision: str
         """
-        
+
         measurement_columns = [c for c in dataframe.columns
                                if c not in (tag_columns or [])]
-        
+
         for column in measurement_columns:
             try:
                 self.client.write_points(
@@ -187,4 +187,4 @@ class InfluxdbHandler:
 
 if __name__ == "__main__":
 
-    influx_handler = InfluxdbHandler()
+    influx_handler = InfluxdbHandler()

+ 0 - 0
cdplib/db_handlers/MongodbHandler.py


+ 1 - 2
cdplib/db_handlers/SQLHandler.py

@@ -508,7 +508,6 @@ class SQLHandler:
         :rtype: DataFrame
         '''
         try:
-            
             connection = self._engine.connect()
 
             data = pd.read_sql(sql=query,
@@ -516,7 +515,7 @@ class SQLHandler:
                                **read_sql_kwargs)
 
             connection.close()
-           
+
             return data
 
         except Exception as e:

+ 2 - 1
cdplib/db_handlers/__init__.py

@@ -1,2 +1,3 @@
 from .MongodbHandler import *
-from .SQLHandler import *
+from .SQLHandler import *
+from .InfluxdbHandler import *

+ 0 - 0
cdplib/db_migration/DataFrameToCollection.py


+ 27 - 25
cdplib/db_migration/MigrationCleaning.py

@@ -255,9 +255,11 @@ class MigrationCleaning:
             columns = db.get_column_names(tablename=self._inconsist_report_table)
 
             if len(columns) > 0:
-                columns_not_in_data = [column for column in columns if column not in data.columns]
-                for value in columns_not_in_data:
-                    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
+                # TODO Tanya: The commented lines caused the reason to be the same for all entries.
+
+                #columns_not_in_data = [column for column in columns if column not in data.columns]
+                #for value in columns_not_in_data:
+                #    data_inconsist[value] = 'Column does not exist in the mongo database and has therefore been dropped'
                 data_inconsist = data_inconsist[columns]
 
         db.append_to_table(data=data_inconsist,
@@ -396,7 +398,7 @@ class MigrationCleaning:
                     data[column] = data[column].astype(python_type)
 
                 elif python_type == float:
-                    
+
                     data[column] = data[column].fillna(np.inf)
                     # Replaces empty fields when type is string
                     if data[column].dtypes == object:
@@ -564,15 +566,15 @@ class MigrationCleaning:
         return data
 
     def clean_json_from_None_object(self, data: pd.DataFrame, clean_bool: bool = True) -> pd.DataFrame():
-        
+
         data = data.to_json(date_format="iso")
         data = json.loads(data)
         new_data = remap(data, lambda p, k, v: v is not None)
         new_data = remap(new_data, lambda p, k, v: v != 'None')
         new_data = remap(new_data, lambda p, k, v: v != 'inf')
-        # cleans not only bool type also int which are 0 or 1 
+        # cleans not only bool type also int which are 0 or 1
         # only use if it is necessary have to be change that it only considers
-        # Ture and False for bools 
+        # Ture and False for bools
         if clean_bool:
             new_data = remap(new_data, lambda p, k, v: (isinstance(v,bool) or (not isinstance(v,bool) and bool(v))))
         return new_data
@@ -588,27 +590,27 @@ class MigrationCleaning:
 
 
     def map_toleranzen_values(self, data: pd.DataFrame, toleranzen: pd.DataFrame):
-        
+
         toleranzen.drop('nr', axis=1, inplace=True)
-        
+
         toleranzen.columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.geometrie.durchmesser.min', 'wellenschenkel.geometrie.durchmesser.max', 'innenring.geometrie.durchmesser.min',
                         'innenring.geometrie.durchmesser.max', 'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
 
-        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max', 
+        labyrinten_drop_columns = ['innenring.geometrie.durchmesser.min', 'innenring.geometrie.durchmesser.max',
                                     'wellenschenkel_innenring_difference.geometrie.durchmesser.min', 'wellenschenkel_innenring_difference.geometrie.durchmesser.max']
-        
+
         labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.geometrie.durchmesser.min', 'labyrinthring.geometrie.durchmesser.max']
-        
+
         reparatur_stufe_labyrinten_columns= ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'labyrinthring.reparatur_stufe.durchmesser.min', 'labyrinthring.reparatur_stufe.durchmesser.max']
 
-        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min', 
+        reparatur_stufe_columns = ['toleranzbez_wellen_reference', 'toleranzbez_innenring_reference', 'wellenschenkel.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel.reparatur_stufe.durchmesser.max', 'innenring.reparatur_stufe.durchmesser.min',
-                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min', 
+                                    'innenring.reparatur_stufe.durchmesser.max', 'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.min',
                                     'wellenschenkel_innenring_difference.reparatur_stufe.durchmesser.max']
 
-        
+
         toleranzen_reference_columns = ['wellenschenkel_toleranz', 'labyrinthring_toleranz', 'wellen_reparatur_stufe_toleranz', 'labyrinthring_reparatur_stufe_toleranz']
-        
+
         available_columns = [column for column in data.columns if column in toleranzen_reference_columns]
         for column in available_columns:
             merge_map = [False] *len(data.index)
@@ -623,13 +625,13 @@ class MigrationCleaning:
 
                     else:
                         temp_toleranzen.columns = labyrinten_columns
-                
+
                 elif 'reparatur_stufe' in column:
                     temp_toleranzen.columns = reparatur_stufe_columns
                     merge_map = data['innenring_reparatur_stufe_zulaessig'] == 'Ja'
                 data_before = len(data.index)
                 data = data.merge(temp_toleranzen, how='left', left_on=column, right_on='toleranzbez_wellen_reference')
-                data.loc[merge_map, temp_toleranzen.columns] = np.nan 
+                data.loc[merge_map, temp_toleranzen.columns] = np.nan
                 if data_before != len(data.index):
                     print('WEVE LOST DATA!!')
                     print('before:', data_before, 'now:', len(data.index))
@@ -641,9 +643,9 @@ class MigrationCleaning:
 
     def label_is_level(
                     self,
-                    data: pd.DataFrame, 
-                    column: str = "is", 
-                    include_schrott: bool = False, 
+                    data: pd.DataFrame,
+                    column: str = "is",
+                    include_schrott: bool = False,
                     drop_rows_with_no_is: bool = False) -> pd.DataFrame:
         '''
         '''
@@ -659,16 +661,16 @@ class MigrationCleaning:
                 data.loc[data[column].isin(v), column] = k
             else:
                 data.loc[data[column].isnull(), column] = k
-        
+
         if include_schrott and ("operation_type_2" in data.columns):
             schrott_mask = (data["operation_type_2"] == 2)
             data.loc[schrott_mask, column] = 5
-        
+
         data.loc[~data[column].isin([0,1,2,3,4,5]), column] = 0
-                    
+
         if drop_rows_with_no_is:
             data = data.loc[data[column] != 0].copy(deep=True)
-            
+
         return data.reset_index(drop=True)
 
 

+ 0 - 0
cdplib/db_migration/ParseDbSchema.py


+ 0 - 0
cdplib/db_migration/ParseJsonSchema.py


+ 0 - 0
cdplib/db_migration/ParseMapping.py


+ 270 - 0
cdplib/feature_engineering/StatisticalFeatures.py

@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" 
+Created on Tue Oct 16 16:08:47 2018
+
+@author: tanya
+"""
+import types
+import logging
+import pandas as pd
+
+from collections import defaultdict
+from functools import reduce
+
+from libraries.logging.logging_utils import configure_logging
+from libraries.exception_handling import InputChecks
+          
+class StatisticalFeatures:
+    '''
+    Groups data by index columns and returns aggregated statistics for given columns
+    
+    :param list of tuples or dict index_cols: 
+        is either a list of tuples of form: [(colname_1, [aggfunc_1, aggfunc_2]), 
+                                             (colname_2, aggfunc_3)]
+        or a dictionary of form: {colname_1 : [aggfunc_1, aggfunc_2], colname_2 : aggfunc_3}
+        where colname_i is column to aggregate and aggfunc_i are either 
+        function variables or strings accepted by pandas for built-in function names.
+        REMARQUE: using strings for built-in functions will speed up the calculations by a factor >= 20.
+        WARNING: if multiple aggfuncs with the same name are given for a given column (like 'sum' and np.sum),
+        then only the first one is kept.
+        WARNING: nan values are ignored numpy and pandas built-in aggregation functions.
+        
+    '''
+    def __init__(self, data, index_cols, path_to_log = None):
+        '''
+        '''
+        configure_logging(path_to_log)
+            
+        self.logger = logging.getLogger(__name__)
+        
+        self.checks = InputChecks(logger = self.logger)
+        
+        self.data = data
+        
+        self.checks.assert_correct_type({'data', [pd.DataFrame]})
+            
+        self.index_cols = index_cols
+        
+        # make warning about missing values in index columns
+        for col in self.index_cols:
+            if data[col].isnull().any():
+                self.logger.warning('Index column ' + str(col) + ' contains missing values, no features for those will be returned')
+
+        
+    def get_kpis_by_aggregation(self, kpis):
+        '''
+        Aggregates given fields with given aggregation functions
+         USE CASE: per product find mean and standard variation of a price
+        
+        :param list or dict kpis: either a list of tuples like [(field1, [aggfunc1, aggfunc2]), (field2, aggfunc)]
+         or a dictionary like {field1 : [aggfunc1, aggfunc2], field2 : aggfunc}
+         where aggfunc-s are reducing functions of either function type or strings standing for functions built in pandas module
+         
+        :return: features with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        def get_valid_agg_dict_from_kpis(kpis):
+            '''
+            Filters inputs of incorrect shape or type,
+            Filters out columns not present in data
+            Removes multiple functions with the same name
+            Makes an a quick check that the aggregation with given fields and functions does not fail on the first 2 lines
+            Reports to the log
+            :param list or dict kpis:
+            '''
+            def get_name(x):
+                '''
+                Returns function name for function and does nothing for string
+                '''
+                if isinstance(x, types.FunctionType):
+                    return x.__name__
+                else:
+                    return x
+                
+            def passed_first_line_type_control(col, aggfunc):
+                '''
+                Checks if aggregation works on the first 2 lines of the data
+                '''
+                try:
+                    cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+                    self.data.iloc[:2]\
+                             .fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                             .groupby(self.index_cols)\
+                             .agg({col : aggfunc})
+                    return True
+                except Exception as e:
+                    self.logger.warning('Cannot use aggfunc ' + str(aggfunc) + ' on the column ' + str(col) + ' because of the error : ', str(e))
+                    return False
+           
+            
+            
+            valid_kpi_dict = defaultdict(list)
+            
+            if isinstance(kpis, list):
+                incorrect_lengths = [len(kpi) !=2 for kpi in kpis]
+                if sum(incorrect_lengths) > 0:
+                    self.logger.warning('Inputs ' + str(kpis[incorrect_lengths]) + 'do not have correct length.')
+                
+                cols = list(zip(*kpis))[0]             
+                kpis = [t for t in kpis if (len(t) == 2) and (t[0] in self.data.columns)]
+            elif isinstance(kpis, dict):
+                cols = list(kpis.keys())
+                kpis = {k:v for k,v in kpis.items() if k in self.data.columns}.items() 
+                
+            cols_not_in_data = set(cols) - set(self.data.columns)
+            if len(cols_not_in_data) > 0:
+                self.logger.warning('Columns ' + ', '.join([str(c) for c in cols_not_in_data]) + ' are not contained in data therefore cannot be used in feature generation.')
+                
+            for col, aggfuncs in kpis:
+                if not isinstance(aggfuncs, list):
+                    aggfuncs = [aggfuncs]
+                
+                for aggfunc in aggfuncs:
+                    is_new_funcname = all([get_name(aggfunc) != get_name(f) for f in valid_kpi_dict[col]])
+                    if not is_new_funcname:
+                        self.logger.warning('Aggfunc ' + str(aggfunc) + ' cannot be used in column ' + str(col) + ', aggfunc with same name is already used.')
+                    
+                    if passed_first_line_type_control(col, aggfunc) and is_new_funcname:
+                        valid_kpi_dict[col].append(aggfunc)
+                    
+            return valid_kpi_dict
+                   
+        
+        
+        
+        agg_dict = get_valid_agg_dict_from_kpis(kpis)
+        
+        if len(agg_dict) > 0:
+        
+            new_names = ['_'.join([col, aggfunc.__name__]) if isinstance(aggfunc, types.FunctionType) 
+                             else '_'.join([col, str(aggfunc)]) 
+                                 for col, aggfuncs in agg_dict.items() for aggfunc in aggfuncs]
+            
+            cols_of_object_type = set(self.data.columns[self.data.dtypes.eq(object)]) - set(self.index_cols)
+            return self.data.fillna(value = {c:'nan' for c in  cols_of_object_type})\
+                       .groupby(self.index_cols)\
+                       .agg(agg_dict)\
+                       .set_axis(new_names, axis = 'columns', inplace = False)\
+                       .reset_index()
+        else:
+            return self.data[self.index_cols].drop_duplicates().reset_index(drop = True)
+        
+        
+        
+        
+        
+        
+        
+    def get_value_stats(self, pivot_col, value_col = None, aggfunc = None, entries = None):
+        '''
+        A wrapper crosstab method with index equal to index_cols
+        USE CASE: per product find standart variation of the price in each city
+        
+        :param str pivot_col: column values of which become columns in the output
+        :param str value_col: column name to fillin vlaues
+        :param str or func aggfunc: count if None
+        :param list entries: values of pivot_col to show
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        
+        # assert that types of the inputs are correct
+        types_to_check = {'columns' : [str], 
+                          'value_col' : [str, type(None)],  
+                          'aggfunc' : ['str', types.FunctionType, type(None)], 
+                          'entries' : [list, type(None)]}
+        
+        self.checks.assert_correct_type(types_to_check)
+        
+        cols_to_check = [pivot_col]
+        if not value_col is None:
+            cols_to_check.append(value_col)
+        self.checks.assert_column_presence(data = self.data, colnames = cols_to_check)        
+
+        if not entries is None:
+            entry_filter = reduce(lambda a,b: a|b, [(self.data[pivot_col] == ent) for ent in entries])
+        else:
+            entry_filter = pd.Series([True]*len(self.data))              
+    
+        index = [self.data.loc[entry_filter, col] for col in self.index_cols]
+        columns = self.data.loc[entry_filter, pivot_col]
+        if not value_col is None:
+            value_col = self.data.loc[entry_filter, value_col]
+                        
+        result = pd.crosstab(index = index, columns = columns, values = value_col, aggfunc = aggfunc)
+        result = result.rename(columns = {c : value_col + '_' + str(c) for c in result.columns})\
+                       .reset_index()
+        return result
+    
+
+
+
+
+        
+    
+    def get_aggregated_value_stats(self, pivot_col, value_col = None, aggfunc_step1 = None, aggfuncs_step2 = None, entries = None):
+        '''
+        Aggregates values obtained with method get_value_stats
+         USE CASE: per product find average variation of the price over all cities
+         
+        :param str pivot_col:
+        :param str value_col:
+        :param str or func aggfunc_step1: aggfunc used in method get_value_stats
+        :param list aggfuncs_step2: aggregation functions used to aggregate the output of method get_value_stats
+        :param list entries: 
+        :return: table with index- and kpi- columns
+        :rtype: pandas DataFrame
+        '''
+        self.checks.assert_correct_type({'aggfuncs_step2' : [list, type(None)]})
+        
+        value_stat_kpis = self.get_value_stat_kpis(pivot_col = pivot_col, value_col = value_col, aggfunc = aggfunc_step1, entries = entries)
+
+        result = value_stat_kpis[self.index_cols].copy(deep = True)
+        
+        for aggfunc in aggfuncs_step2:
+            colname = '_'.join(aggfunc, aggfunc_step1, value_col, pivot_col)
+            
+            if isinstance(aggfunc, str):
+                result[colname] = getattr(value_stat_kpis.set_index(self.index_cols), aggfunc)().reset_index(drop = True)
+            else:
+                result[colname] = value_stat_kpis.set_index(self.index_cols)\
+                                                 .apply(aggfunc, axis = 1)\
+                                                 .reset_index(drop = True)
+                                                 
+        return result
+                              
+                              
+                              
+                              
+                                                            
+    def get_critical_value_stats(self, min_or_max, pivot_col, value_col = None, aggfunc = None):
+        '''
+        Finds argmin or argmax of a column
+         USE CASE: per product find the city with maximum variation of the price
+        
+        :param str min_or_max: must be in ['min', 'max']
+        :param str pivot_col:
+        :param str value_col:
+        :param str aggfunc:    
+        '''
+        self.checks.assert_valid_value(arname = 'min_or_max', val = min_or_max, valid_values = ['min', 'max'])
+        
+        if min_or_max == 'max':
+            aggfuncs_step2 = ['idxmax']
+        else:
+            aggfuncs_step2 = ['idxmin']
+            
+        return self.get_aggregated_value_stat_kpis(pivot_col = pivot_col, 
+                                                   value_col = value_col, 
+                                                   aggfunc_step1 = aggfunc, 
+                                                   aggfucs_step2 = aggfuncs_step2)
+        
+        
+        
+        
+    # TODO : incorporate frequency, recency of numeric columns crossing a threshold value by default equal to 0.
+    
+    # can also add pick detection from the other project and calculate the number of picks. Probably first create TimeSeriesManipulation class.
+    
+    # write tests for all methods

+ 77 - 0
cdplib/feature_engineering/StatisticalFeaturesAveragedOverTimePeriods.py

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 15:11:21 2018
+
+@author: tanya
+"""
+
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeaturesOverTime
+
+
class StatisticalFeaturesAveragedOverTimePeriods(StatisticalFeaturesOverTime):
    '''
    Splits the observation window into n_periods consecutive periods of
    equal length, computes the statistical features per period, and
    averages them over the periods.
    '''

    def __init__(self, data, index_cols, date_col, split_date, period_length,
                 past_or_future='past', freq='days', n_periods=1,
                 path_to_log=None):
        '''
        :param pd.DataFrame data: input data
        :param list index_cols: columns identifying one entity
        :param str date_col: name of the date column
        :param split_date: reference date of the observation window
        :param int period_length: length of one period (in units of freq)
        :param str past_or_future: 'past' or 'future' relative to split_date
        :param str freq: time unit, e.g. 'days'
        :param int n_periods: number of periods the window is split into
        :param str path_to_log: path to the log file
        '''
        # bug fixes: the original signature was missing 'self',
        # super(Cls).__init__ lacked the instance, and 'path_to_log'
        # was passed positionally after keyword arguments (SyntaxError).
        super().__init__(data=data.copy(deep=True),
                         index_cols=index_cols,
                         date_col=date_col,
                         split_date=split_date,
                         period_length=n_periods * period_length,
                         past_or_future=past_or_future,
                         freq=freq,
                         path_to_log=path_to_log)

        # pick a period-number column name that does not clash with the data
        self.period_number_col = 'period_number'
        while self.period_number_col in data.columns:
            self.period_number_col += '&'

        # Enumerate the distinct dates of each entity and chunk them into
        # periods of period_length consecutive dates.
        # NOTE(review): integer division is used so that all dates of one
        # chunk share the same period index — confirm this matches the
        # intended semantics (the original divided with '/').
        period_numbers = (
            self.data[self.index_cols + [date_col]]
                .drop_duplicates()
                .assign(**{self.period_number_col:
                           lambda df: df.groupby(self.index_cols)[date_col]
                                        .cumcount() // period_length}))

        # bug fix: the original called pd.merge(self, data, period_numbers,..)
        # NOTE(review): the merge key includes the date column, otherwise
        # each row would match every period number of the same entity.
        self.data = pd.merge(self.data, period_numbers,
                             how='left', on=self.index_cols + [date_col])

        self.initial_index_cols = self.index_cols.copy()
        self.index_cols.append(self.period_number_col)

    def _aggregate_over_time_periods(self, df):
        '''
        Average per-period kpis over the periods of each entity.
        '''
        return df.drop(self.period_number_col, axis=1)\
                 .groupby(self.initial_index_cols)\
                 .mean()\
                 .reset_index()

    def get_kpis_by_aggregation(self, **args):
        '''
        Per-period kpis averaged over the periods.
        '''
        return self._aggregate_over_time_periods(
            super().get_kpis_by_aggregation(**args))

    def get_value_stats(self, **args):
        '''
        Per-period value statistics averaged over the periods.
        '''
        return self._aggregate_over_time_periods(
            super().get_value_stats(**args))

    def get_aggregated_value_stats(self, **args):
        '''
        Per-period aggregated value statistics averaged over the periods.
        (bug fix: 'args' was not unpacked with ** in the original signature)
        '''
        return self._aggregate_over_time_periods(
            super().get_aggregated_value_stats(**args))

+ 53 - 0
cdplib/feature_engineering/StatisticalFeaturesOverTime.py

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Nov  7 14:02:18 2018
+
+@author: tanya
+"""
+
+import logging
+import pandas as pd
+
+from libraries.feature_engineering.in_memory_feature_engineering import StatisticalFeatures
+from libraries.exception_handling import InputChecks, InputCasts
+from libraries.logging.logging_utils import configure_logging
+
class StatisticalFeaturesOverTime(StatisticalFeatures):
    '''
    Restricts the data to a time window relative to split_date
    (looking into the past or the future) before computing
    statistical features.
    '''
    def __init__(self, data, index_cols, date_col, split_date,
                 period_length=None, past_or_future='past', freq='days',
                 path_to_log=None):
        '''
        :param pd.DataFrame data: input data
        :param list index_cols: columns identifying one entity
        :param str date_col: name of the date column
        :param split_date: reference date of the observation window
        :param int period_length: window length in units of freq;
            when None the window is unbounded in the past
            (resp. one unit long in the future)
        :param str past_or_future: 'past' or 'future'
        :param str freq: time unit of period_length
        :param str path_to_log: path to the log file
        '''
        configure_logging(path_to_log)
        self.logger = logging.getLogger(__name__)
        self.checks = InputChecks(logger=self.logger)
        self.casts = InputCasts(logger=self.logger)

        self.checks.assert_column_presence(data=data, colnames=[date_col])
        # bug fix: these assertions live on self.checks, not on self
        self.checks.assert_valid_value(argname='past_or_future',
                                       val=past_or_future,
                                       valid_values=['past', 'future'])
        self.checks.assert_valid_value(argname='freq', val=freq,
                                       valid_values=['seconds', 'minutes',
                                                     'hours', 'days', 'weeks',
                                                     'months', 'years'])

        # bug fix: cast split_date and the date column to datetime BEFORE
        # they are used in the date arithmetic below (the original cast
        # them afterwards, so min/sup dates were computed on raw values).
        split_date = self.casts.cast_arg_to_pandas_datetime(
            argname='split_date', val=split_date)
        data[date_col] = self.casts.cast_column_to_pandas_datetime(
            series=data[date_col], colname=date_col, all_or_any='all')

        if past_or_future == 'past':
            if period_length is not None:
                min_date = split_date - pd.DateOffset(**{freq: period_length})
            else:
                # unbounded past: take everything from the earliest record
                min_date = data[date_col].min()
            sup_date = split_date
        else:
            min_date = split_date
            if period_length is not None:
                sup_date = split_date + pd.DateOffset(**{freq: period_length})
            else:
                # default future window of one time unit
                sup_date = split_date + pd.DateOffset(**{freq: 1})

        # half-open interval [min_date, sup_date)
        time_mask = (data[date_col] >= min_date) & (data[date_col] < sup_date)

        # bug fix: super(Cls).__init__ lacked the instance; use super()
        super().__init__(data=data.loc[time_mask]
                                  .reset_index(drop=True)
                                  .copy(deep=True),
                         index_cols=index_cols,
                         path_to_log=path_to_log)

+ 173 - 0
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 23 08:51:53 2020
+
+@author: tanya
+
+@description: class for fine-tuning a sklearn classifier
+(optimizing the probability threshold)
+"""
+
+import pandas as pd
+import numpy as np
+
+from typing import Callable
+
+from sklearn.base import (BaseEstimator, ClassifierMixin,
+                          clone, MetaEstimatorMixin)
+
+from cdplib.log import Log
+
+from cdplib.utils.TyperConverter import TypeConverter
+
+
class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
                            MetaEstimatorMixin):
    """
    Probability threshold tuning for a given estimator.
    Overrides the method predict of the given sklearn classifier
    and returns predictions with the optimal value of
    the probability threshold.

    An object of this class can be passed to an sklearn Pipeline.
    """
    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
                 cv=None, threshold_step: float = 0.1):
        """
        :param estimator: sklearn classifier exposing predict_proba
        :param cost_func: metric of the form cost_func(y_true, y_pred)
        :param greater_is_better: when True cost_func is maximized
        :param cv: iterable of (train_indices, val_indices) pairs;
            must be provided before fit is called
        :param threshold_step: granularity of the threshold grid in (0, 1)
        """
        self.estimator = estimator

        self.is_fitted = False

        self.greater_is_better = greater_is_better

        # bug fix: the original assigned the Ellipsis placeholder here;
        # keep None and fail fast in fit() with a clear message instead.
        self.cv = cv

        self.cost_func = cost_func

        self.threshold_step = threshold_step

        self.optimal_threshold = 0.5

        self._logger = Log("FineTunedClassifyCV")

    def _get_best_threshold(self, y_val: (pd.DataFrame, np.array),
                            proba_pred: (pd.DataFrame, np.array)):
        '''
        Evaluate cost_func on a grid of thresholds and return the
        threshold with the best (max or min) cost.
        '''
        costs = {}

        for t in np.arange(self.threshold_step, 1, self.threshold_step):
            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))

        if self.greater_is_better:
            return max(costs, key=costs.get)
        else:
            return min(costs, key=costs.get)

    def fit(self, X: (pd.DataFrame, np.array),
            y: (pd.DataFrame, np.array) = None,
            **fit_args):
        """
        Per cv fold: fit a clone of the estimator on the train split,
        find the best threshold on the validation split. The final
        threshold is the mean over folds; the estimator is then refit
        on the full data.
        """
        if self.cv is None:
            raise ValueError("Parameter 'cv' must be provided before fit")

        X = TypeConverter().convert_to_ndarray(X)
        if y is not None:
            # bug fix: the original converted X a second time here
            y = TypeConverter().convert_to_ndarray(y)

        optimal_thrs_per_fold = []

        for train_inds, val_inds in self.cv:
            X_train, X_val = X[train_inds], X[val_inds]

            if y is not None:
                y_train, y_val = y[train_inds], y[val_inds]
            else:
                y_train, y_val = None, None

            # bug fix: the original referenced the module-level test
            # variable 'fine_tuned_clf' instead of self
            estimator = clone(self.estimator)

            estimator.fit(X_train, y_train, **fit_args)

            # probabilities on the held-out fold drive threshold choice
            proba_pred = estimator.predict_proba(X_val)

            optimal_thr = self._get_best_threshold(y_val, proba_pred)

            optimal_thrs_per_fold.append(optimal_thr)

        self.optimal_threshold = np.mean(optimal_thrs_per_fold)

        # bug fix: refit on the full data must include y
        self.estimator.fit(X, y, **fit_args)

        self.is_fitted = True  # bug fix: flag was never set

        return self  # sklearn convention

    def predict(self, X: (pd.DataFrame, np.array)) -> np.array:
        """
        Binary predictions with the tuned probability threshold.
        """
        if self.is_fitted:

            proba_pred = self.estimator.predict_proba(X)

            return (proba_pred >= self.optimal_threshold).astype(int)

        else:
            self._logger.warn("You should fit first")

    def get_params(self):
        """
        Parameters of the wrapped estimator plus the wrapper's own.
        """
        params = self.estimator.get_params()

        params.update({"cv": self.cv, "cost_func": self.cost_func})

        return params

    def set_params(self, **params: dict):
        """
        Set wrapper-level parameters ('cv', 'cost_func'); everything
        else is forwarded to the wrapped estimator.
        """
        # bug fix: the original popped from the dict while iterating it,
        # which raises RuntimeError
        if "cv" in params:
            self.cv = params.pop("cv")

        if "cost_func" in params:
            self.cost_func = params.pop("cost_func")

        self.estimator.set_params(**params)
+
+
if __name__ == "__main__":
    # test
    # Smoke test: tune the probability threshold of an XGBoost
    # random-forest classifier on a binarized iris problem.
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    import gc
    from xgboost import XGBRFClassifier

    data = load_iris()
    X, y = data["data"], data["target"]
    # binarize the target: class 1 vs the rest
    y = (y==1).astype(int)
    del data
    gc.collect()

    # make a custom cv object
    # expanding-window splits: train on [0, i), validate on [i, i + val_len)
    val_len = len(X)//10
    split_inds = range(len(X)//2, len(X), val_len)

    cv = []

    for i in split_inds:
        train_inds = list(range(i))
        val_inds = list(range(i, i + val_len))
        cv.append((train_inds, val_inds))

    clf = XGBRFClassifier()

    fine_tuned_clf = FineTunedClassifierCV(estimator=clf,
                                           cv=cv,
                                           greater_is_better=True,
                                           cost_func=accuracy_score)

    fine_tuned_clf.fit(X=X, y=y)

+ 375 - 0
cdplib/gridsearch/GridSearchPipelineSelector.py

@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:15:17 2020
+
+@author: tanya
+@description:a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is though in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import os
+import datetime
+from collections import ChainMap
+from itertools import product
+from typing import Callable, Optional, Literal, Dict, Union, List
+
+import numpy as np
+import pandas as pd
+from sklearn.pipeline import Pipeline
+
+from cdplib.log import Log
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
+
+
class GridSearchPipelineSelector(PipelineSelector):
    """
    A class for selecting a machine learning
     pipeline from a deterministic space of parameter distributions
     over multiple pipelines.
     The selection is done in such a way that a trials object is being
     maintained during the tuning process from which one can retrieve
     the best pipeline so far as well as the entire tuning history
     if needed.
    """
    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: Optional[int] = None,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Optional[Dict[str, Callable]] = None,
                 strategy_name: Optional[str] = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"
                 ):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space

        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.

        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and pipelines, and parameters tried out so far. If a trials object
            already exists at the given path, it is loaded and the
            search is continued, else, the search is started from scratch.

        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            if None, the trials object is backed up every time
            the score improves.

        :param Callable cross_val_averaging_func: Function to aggregate
            the cross-validation scores.
            Example different from the mean: mean - c*var.

        :param additional_metrics: dict of additional metrics to save
            of the form {"metric_name": metric} where metric is a Callable.

        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            cv object, cost function.
            When the strategy changes, one must start with new trials.

        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        # Bug fix: create the logger BEFORE calling super().__init__,
        # otherwise a failing parent initialization would hit the except
        # branch below with self._logger still unassigned (NameError).
        self._logger = Log("GridsearchPipelineSelector: ",
                           stdout_log_level=stdout_log_level)

        try:
            super().__init__(cost_func=cost_func,
                             greater_is_better=greater_is_better,
                             trials_path=trials_path,
                             backup_trials_freq=backup_trials_freq,
                             cross_val_averaging_func=cross_val_averaging_func,
                             additional_metrics=additional_metrics,
                             strategy_name=strategy_name,
                             stdout_log_level=stdout_log_level)

            # for grid search the trials store is a plain list
            self._trials = self._trials or []

        except Exception as e:
            err = "Failed initialization. Exit with error: {}".format(e)

            self._logger.log_and_raise_error(err)

    def run_trials(self) -> None:
        """
        Unfold the attached space into the full cartesian product of
        parameter combinations per pipeline and evaluate every
        combination that has not been tried yet, appending the result
        to the trials list.
        """
        try:
            assert(self.attached_space),\
                "Parameter distribution space must be attached"

            done_trial_ids = [{"name": trial["name"],
                               "params": trial["params"],
                               "status": trial["status"]}
                              for trial in self._trials]

            # list (generator) of (flattened) dictionaries
            # with all different combinations of
            # parameters for different pipelines
            # from the space definition.
            space_unfolded = ({"name": param_dist["name"],
                               "pipeline": param_dist["pipeline"],
                               "params": param_set}
                              for param_dist in self._space
                              for param_set in
                              (dict(ChainMap(*tup)) for tup in
                               product(*[[{k: v} for v in
                                          param_dist["params"][k]]
                                         for k in param_dist["params"]])))

            for space_element in space_unfolded:

                # uniquely identifies the current space element
                trial_id = {"name": space_element["name"],
                            "params": space_element["params"],
                            "status": 'ok'}

                # verify if the current pipeline/parameters
                # were already tested before
                if trial_id in done_trial_ids:
                    continue

                result = self._objective(space_element)

                pipeline = space_element["pipeline"].set_params(
                        **space_element["params"])

                trial = {"name": space_element["name"],
                         "params": space_element["params"],
                         "pipeline": pipeline}

                trial.update(result)

                self._trials.append(trial)

            self.finished_tuning = True

            self.total_tuning_time = datetime.datetime.today()\
                - self.start_tuning_time

            self._backup_trials()

        except Exception as e:
            err = "Failed to run trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    @property
    def number_of_trials(self) -> Union[int, None]:
        """
        Number of trials already run in the current trials object
        """
        try:
            return len(self._trials)

        except Exception as e:
            err = ("Failed to retrieve the number of trials. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial(self) -> Union[dict, None]:
        """
        Trial with the highest stored score.
        NOTE(review): assumes scores are stored so that larger is
        better (presumably the parent negates the cost when
        greater_is_better is False) — confirm in PipelineSelector.
        """
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return max(self._trials, key=lambda x: x["score"])

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_score(self) -> Union[float, None]:
        '''
        Score of the best trial.
        '''
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return self.best_trial["score"]

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_score_variance(self) -> Union[float, None]:
        '''
        Cross-validation score variance of the best trial.
        '''
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return self.best_trial["score_variance"]

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_pipeline(self) -> Union[Pipeline, None]:
        '''
        Fitted-parameter pipeline of the best trial.
        '''
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return self.best_trial["pipeline"]

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def get_n_best_trial_pipelines(self, n: int)\
            -> Union[List[Pipeline], None]:
        """
        N best pipelines with corresponding
        best hyperparameters
        """
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return [trial["pipeline"] for trial in
                    sorted(self._trials, key=lambda x: x["score"],
                           reverse=True)[:n]]

        except Exception as e:
            err = ("Failed to retrieve n best trials. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
            -> Union[Dict[str, List[Pipeline]], None]:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline-types
        with corresponding hyperparameters
        """
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            # top-n rows by score within each pipeline name,
            # returned as {name: [pipelines]}
            return pd.DataFrame(self._trials)\
                     .sort_values(by=["name", "score"],
                                  ascending=False)\
                     .groupby("name")\
                     .head(n)\
                     .groupby("name")["pipeline"]\
                     .apply(lambda x: list(x))\
                     .to_dict()

        except Exception as e:
            err = ("Failed to retrieve n best trials of each type."
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def trials_to_excel(self, path: str) -> None:
        """
        Trials object in the shape of table written to excel,
        should contain the run number, pipeline (as str),
        hyperparameters (as str), self.best_result (see self._objective
        method) as well as additional information configured
        through self.save_result method.
        """
        try:
            pd.DataFrame(self._trials).to_excel(path)

        except Exception as e:
            err = ("Failed to write trials to excel. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)
+
+
if __name__ == "__main__":

    # elementary example
    # End-to-end smoke test: persist a toy data set and cv split to disk,
    # run the grid search over the sample space, then clean up.

    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import accuracy_score, precision_score
    from cdplib.gridsearch.space_sample import space
    from cdplib.log import Log
    from cdplib.db_handlers import MongodbHandler
    import pickle
    import pandas as pd
    import os

    trials_path = "gridsearch_trials_TEST.pkl"
    additional_metrics = {"precision": precision_score}
    strategy_name = "strategy_1"
    data_path = "data_TEST.h5"
    cv_path = "cv_TEST.pkl"
    collection_name = 'TEST_' + strategy_name

    logger = Log("GridSearchPipelineSelector__TEST:")

    logger.info("Start test")

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    # persist the data set to hdf5 so the selector can attach it from disk
    pd.DataFrame(X).to_hdf(data_path, key="X_train")
    pd.Series(y).to_hdf(data_path, key="y_train")

    # two expanding-window style train/validation splits
    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]

    pickle.dump(cv, open(cv_path, "wb"))

    gs = GridSearchPipelineSelector(cost_func=accuracy_score,
                                    greater_is_better=True,
                                    trials_path=trials_path,
                                    additional_metrics=additional_metrics,
                                    strategy_name=strategy_name,
                                    stdout_log_level="WARNING")

    gs.attach_space(space=space)

    gs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
                             cv_pickle_path=cv_path)

    # save the tuning summary to a MongoDB collection
    save_method = MongodbHandler().insert_data_into_collection
    save_kwargs = {'collection_name': collection_name}

    gs.configer_summary_saving(save_method=save_method,
                               kwargs=save_kwargs)

    gs.run_trials()

    logger.info("Best trial: {}".format(gs.best_trial))
    logger.info("Total tuning time: {}".format(gs.total_tuning_time))

    # clean up the temporary artifacts created by this test
    for file in [trials_path, data_path, cv_path]:
        os.remove(file)

    logger.info("End test")

+ 33 - 0
cdplib/gridsearch/space_sample.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+
# Sample deterministic search space for GridSearchPipelineSelector:
# two candidate pipelines, each with a small hyperparameter grid.
# Every entry has a unique "name", a sklearn Pipeline, and a "params"
# dict mapping pipeline-step parameters to the lists of values to try.
space = [
        {"name": "std_scaler_kbest_rf",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("kbest", SelectPercentile()),
                 ("rf", RandomForestClassifier())]),
         "params": {"kbest__percentile": [2, 3],
                    "rf__n_estimators": [10, 20]}},

        {"name": "std_scaler_pca_lr",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("pca", PCA()),
                 ("lr", LogisticRegression())]),
         "params": {"lr__C": [0.5, 1],
                    "pca__n_components": [2, 3]}}
        ]

+ 0 - 798
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -1,798 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-"""
-Created on Fri Nov  9 13:27:44 2018
-
-@author: tanja
-@description: Implementation of machine learning
-                pipeline selection and tuning with hyperopt library
-"""
-
-import os
-import sys
-import gc
-import logging
-import pickle
-import time
-import datetime
-
-import pandas as pd
-import numpy as np
-
-from sklearn.pipeline import Pipeline
-
-from hyperopt import fmin, tpe, rand, Trials, hp, STATUS_OK, STATUS_FAIL,\
-    space_eval, pyll
-
-from sklearn.model_selection import cross_validate
-
-
-class HyperoptPipelineSelection:
-    '''
-    Use this class to perform a search
-    for a machine learning pipeline in a given parameter space.
-    The parameter space can include multiple types of Pipelines
-    (SVM, XGBOOST, random forest, etc),
-    as well as parameter distributions for each pipeline parameter.
-    See example in main for the expected space structure.
-
-    The search can be performed either randomly
-    or with a tree-based algorithm. (Other methods are currently
-    developped by hyperopt creators).
-
-    Attribute trials is responsible for book-keeping parameter
-    combinations that have already been tried out. This attribute
-    is saved to a binary file every n minutes as well as every time
-    a better pipeline was found.
-    '''
-    def __init__(self,
-                 cost_func,
-                 greater_is_better: bool,
-                 trials_path: str,
-                 backup_trials_freq: int = 1,
-                 log_path: str = None,
-                 averaging_func: callable = None):
-        '''
-        :param callable cost_func: function to minimize or maximize
-
-        :param bool greater_is_better: when True
-            cost_func is maximized, else minimized.
-
-        :param str trials_path: path at which the trials object is saved
-            in binary format. From the trials object we can
-            select information about the obtained scores, score variations,
-            and pipelines, and parameters tried out so far. If a trials object
-            already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
-
-        :param backup_trials_freq: frequecy in interations (trials)
-            of saving the trials object at the trials_path.
-
-        :param str log_path: Optional, when not provided logs to stdout.
-
-        :param callable averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            Classic situation is to take the mean,
-            another example is, for example mean() - c*var().
-        '''
-
-        assert(callable(cost_func)),\
-            "Parameter 'cost_func' must be a callable"
-
-        assert(isinstance(greater_is_better, bool)),\
-            "Parameter 'greater_is_better' must be bool type"
-
-        assert(isinstance(trials_path, str)),\
-            "Parameter 'trials_path' must be of string type"
-
-        if averaging_func is not None:
-            assert(callable(averaging_func)),\
-                "Parameter 'averaging_func' must be a callable"
-
-        self._assert_valid_directory(path=trials_path)
-
-        self._configer_logger(log_path)
-
-        self._cost_func = cost_func
-        # is 1 when cost_func is minimized, -1 when cost func is maximized
-        self._score_factor = (not greater_is_better) - greater_is_better
-        self._trials_path = trials_path
-        # is initialized with empty trials object
-        self._trials = Trials()
-        self._backup_trials_freq = backup_trials_freq
-        self._averaging_func = averaging_func or np.mean
-        # keeping track of the current search iteration
-        self._run_number = 0
-        # space and data need to be attached to perform search.
-        self._space_attached = False
-        self._data_attached = False
-
-        # if a trials object already exists at the given path,
-        # it is loaded and the search is continued. Else,
-        # the search is started from the beginning.
-        if os.path.isfile(trials_path):
-            try:
-                with open(trials_path, "rb") as f:
-                    self._trials = pickle.load(f)
-
-                self._logger.info(("Loaded an existing trials object"
-                                   "Consisting of {} trials")
-                                  .format(len(self._trials.trials)))
-
-            except Exception as e:
-                self._logger.error(("Trials object could not be loaded. "
-                                    "Training starts from the beginning. "
-                                    "Exit with error {}").format(e))
-
-        else:
-            self._logger.info(("No existing trials object was found"
-                               "Initialized an empty trials object."))
-
-        self._best_score = self.best_trial_score
-
-    def _configer_logger(self, log_path: str = None):
-        '''
-        Can be replaced with the existing script later.
-        When log_path is not provided, logs to stdout.
-        '''
-
-        self._logger = logging.getLogger(__name__)
-
-        if (self._logger.hasHandlers()):
-            self._logger.handlers.clear()
-
-        if log_path is not None:
-            assert(isinstance(log_path, str)),\
-                "Parameter 'log_path' must be of string type"
-            self._assert_valid_directory(log_path)
-
-            handler = logging.FileHandler(log_path)
-        else:
-            handler = logging.StreamHandler(sys.stdout)
-
-        formatter = logging.Formatter(
-                '\n %(asctime)s %(levelname)s %(message)s')
-
-        handler.setFormatter(formatter)
-        self._logger.addHandler(handler)
-        self._logger.setLevel("INFO")
-
-    def _backup_trials(self):
-        '''
-        Pickles (Saves) the trials object.
-        Used in a scheduler.
-        '''
-        with open(self._trials_path, "wb") as f:
-            pickle.dump(self._trials, f)
-
-    def _assert_valid_directory(self, path: str):
-        '''
-        If the directory of a path does not exist yet,
-        creates it.
-        '''
-        assert(isinstance(path, str)),\
-            "Parameter 'path' must of str type"
-
-        dirname = os.path.dirname("path")
-
-        if len(dirname) > 0:
-            os.mkdir(dirname, exists_ok=True)
-
-    def attach_space(self, space: pyll.base.Apply = None,
-                     module_path: str = None,
-                     name: str = None):
-        '''
-        :param pyll.base.Apply space: hyperopt space where
-            the search is performed. Optional when a space
-            is loaded from a python module.
-
-        :param str module_path: path to python module
-            where the space is defined. Optional when
-            the space is provided directly.
-
-        :param str name: name of the space loaded from
-            a python module. Optional when the space
-            is provided directly.
-        '''
-        assert((space is not None) or
-               ((module_path is not None) and (name is not None))),\
-            "Either space or (module_path, name) must be provided"
-
-        if space is None:
-            for p in ["modele_path", "name"]:
-                assert(isinstance(p, str)),\
-                    "Parameter '{}' must be of str type".format(p)
-
-            assert(os.path.isfile(module_path)),\
-                "Parameter 'module_path' must be a valid file"
-
-            module, extension = os.path.splitext(os.path.basename(module_path))
-            assert(extension == ",py"),\
-                "Parameter 'space' must be read from a python file"
-
-            sys.path.insert(module_path)
-
-            try:
-                from module import name as space
-            except ImportError:
-                err = "Invalid space location or name"
-                self._logger.error(err)
-                raise Exception(err)
-
-        assert(isinstance(space, pyll.base.Apply)),\
-            "Parameter 'space' must be of hyperopt space type"
-
-        self._space = space
-        self._logger.info("Attached parameter distribution space")
-        self._space_attached = True
-
-    def _convert_to_array(self, x: (pd.DataFrame, np.ndarray))\
-            -> np.ndarray:
-        '''
-        Converts an DataFrame to an numpy array.
-        '''
-        if isinstance(x, np.ndarray):
-            return x
-
-        elif (isinstance(x, pd.core.frame.DataFrame))\
-                or (isinstance(x, pd.core.series.Series)):
-            return x.values
-
-        else:
-            e = 'The argument must be a numpy array or a pandas DataFrame'
-            self._logger.critical(e)
-            raise ValueError(e)
-
-    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
-                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    X_val: (pd.DataFrame, np.ndarray) = None,
-                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    cv: (list, int) = None):
-        '''
-        :param array X_train: data on which
-            machine learning pipelines are trained
-
-        :param array y_train: optional, vector with targets,
-            (not all algorithms require a targets)
-
-        :param array X_val: optional, validation data.
-            When not provided, cross-validated value
-            of the cost_func is calculated.
-
-        :param array y_val: optional, validation targets
-
-        :param list cv: list of tuples containing
-            train and validation indices or an integer representing
-            the number of folds for a random split of data
-            during cross-validation
-            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
-        '''
-
-        X_train = self._convert_to_array(X_train)
-        if y_train is not None:
-            y_train = self._convert_to_array(y_train)
-
-        if X_val is not None:
-            if cv is not None:
-                self._logger.warning(("Both validation set and cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on the validation set!"))
-
-            X_val = self._convert_to_array(X_val)
-
-            train_inds = list(range(len(X_train)))
-            val_inds = list(range(len(X_train),
-                                  len(X_train) + len(X_val)))
-
-            # cost is evaluated with a cross validation function
-            # that accepts an array and a cv object with
-            # indices of the fold splits.
-            # Here we create a trivial cv object
-            # with one validation split.
-            self._cv = [(train_inds, val_inds)]
-            self._X = np.concatenate([X_train, X_val])
-
-            if y_train is not None:
-                if y_val is None:
-                    err = "Argument y_val must be provided"
-                    self._logger.critical(err)
-                    raise ValueError(err)
-                else:
-                    y_val = self._convert_to_array(y_val)
-                    self._y = np.concatenate([y_train, y_val])
-            else:
-                self._y = None
-        else:
-            if cv is None:
-                self._logger.warning(("Neither validation set nor cv object "
-                                      "are set. Validation score will be "
-                                      "calculated on 5 randomly "
-                                      "splitted folds."))
-
-            self._X = X_train
-            self._y = y_train
-            self._cv = cv
-
-        self._logger.info("Attached data")
-        self._data_attached = True
-
-    def _evaluate(self, pipeline: Pipeline) -> dict:
-        '''
-        This method is called in _objective.
-
-        Calculates the cost on the attached data.
-        This function can be overriden, when the cost
-        needs to be calculated differently,
-        for example with a tensorflow model.
-
-        :param Pipeline pipeline: machine learning pipeline
-            that will be evaluated with cross-validation
-
-        :output: dictionary with the aggregated
-            cross-validation score and
-            the score variance.
-        '''
-
-        scores = cross_validate(estimator=pipeline,
-                                X=self._X,
-                                y=self._y,
-                                cv=self._cv or 5,
-                                scoring=make_scorer(self._cost_func),
-                                error_score=np.nan)
-
-        return {'value': self._averaging_func(scores['test_score']),
-                'variance': np.var(scores['test_score'])}
-
-    def _objective(self, space_element: dict) -> dict:
-        '''
-        This method is called in search_for_best_pipeline
-        inside the hyperopt fmin method.
-
-        Uses _evaluate method.
-
-        It must take as input a space element
-        and produce an output in the form of dictionary
-        with 2 obligatory values loss and status
-        (STATUS_OK or STATUS_FAIL). Other
-        values in the output are optional and can be
-        accessed later through the trials object.
-
-        :Warning: fmin minimizes the loss,
-        when _evaluate returns a value to be maximized,
-        it should be multiplied by -1 to obtain loss.
-
-        :param dict space_element: must contain keys
-            name (with the name of the pipeline),
-            pipeline (Pipeline object),
-            params (dict of pipeline params)
-
-        :output: dictionary with keys
-            loss (minimized value),
-            status with values STATUS_OK or STATUS_FAIL
-            uderstood by hyperopt,
-            score (equal to loss or -loss),
-            score_variance,
-            timestamp (end of execution),
-            train_time: execution time
-        '''
-        assert(isinstance(space_element, dict) and
-               set(['name', 'pipeline', 'params']) <= space_element.keys())
-
-        assert(isinstance(space_element['name'], str) and
-               isinstance(space_element['pipeline'], Pipeline) and
-               isinstance(space_element['params'], dict))
-
-        start_time = time.time()
-
-        if not self._data_attached:
-            raise Exception(("Data must be attached in order "
-                             "in order to effectuate the best"
-                             "pipeline search"))
-
-        self._run_number += 1
-
-        pipeline = space_element['pipeline']
-        params = space_element['params']
-        pipeline.set_params(**params)
-
-        self._logger.info(("Run number {0}: "
-                           "Current score is {1}: "
-                           "Training pipeline {2} "
-                           "with parameters: {3}. ").format(
-                             self._run_number,
-                             self._best_score,
-                             space_element['name'],
-                             params))
-
-        try:
-            score_stats = self._evaluate(pipeline)
-            assert(not np.isnan(score_stats["value"])),\
-                "Returned null score"
-
-            if self._run_number % self._backup_trials_freq == 0:
-                self._backup_trials()
-
-            if (self._best_score != self._best_score) or\
-                self._score_factor*score_stats["value"] <\
-                    self._score_factor*self._best_score:
-
-                self._logger.info("Score got better, new best score is: {}"
-                                  .format(score_stats["value"]))
-
-                self._best_score = score_stats['value']
-
-                self._backup_trials()
-
-            end_time = time.time()
-
-            return {'loss': self._score_factor * score_stats["value"],
-                    'status': STATUS_OK,
-                    'score': score_stats["value"],
-                    'score_variance': score_stats["variance"],
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': end_time - start_time}
-
-        except Exception as e:
-
-            self._logger.warning("Trial failed with error {}".format(e))
-
-            return {'loss': np.nan,
-                    'status': STATUS_FAIL,
-                    'score': np.nan,
-                    'score_variance': np.nan,
-                    'timestamp': datetime.datetime.today(),
-                    'train_time': np.nan}
-
-    def search_for_best_pipeline(self,
-                                 niter: int,
-                                 algo: callable = tpe.suggest):
-        '''
-        Method performing the search of the best pipeline in the given space.
-        Calls fmin function from the hyperopt library to minimize the output of
-        _objective.
-
-        :params int niter: number of search iterations
-        :param callable algo: now can only take values tpe for a tree-based
-            random search or random for random search
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        assert(isinstance(niter, int)),\
-            "Parameter 'niter' must be of int type"
-
-        # right now only two algorithms are provided by
-        assert(algo in [tpe.suggest, rand.suggest]),\
-            ("Parameter 'algo' can be now only tpe or random. "
-             "If other algorithms have been developped by "
-             "by hyperopt, plased add them to the list.")
-
-        try:
-            self._logger.info(("Starting {0} iterations of search "
-                               "additional to {1} previous"
-                               .format(niter, len(self._trials.trials))))
-
-            best = fmin(fn=self._objective,
-                        space=space,
-                        algo=algo,
-                        trials=self._trials,
-                        max_evals=len(self._trials.trials) + niter)
-
-            # print('AAAA', str(niter))
-
-            self._logger.info(
-                    "Best score is {0} with variance {1}"
-                    .format(
-                     self._trials.best_trial["result"]["score"],
-                     self._trials.best_trial["result"]["score_variance"]))
-
-            self._logger.info(("Finished {0} iterations of search.\n"
-                               "Best parameters are:\n {1} ")
-                              .format(niter,
-                                      space_eval(space, best)))
-
-            self._backup_trials()
-
-        except Exception as e:
-            raise ValueError(("Failed to select best "
-                             "pipeline! Exit with error: {}").format(e))
-
-    @property
-    def best_trial_score(self) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > 0:
-            return self._trials.best_trial["result"]["score"]
-        else:
-            return np.nan
-
-    @property
-    def best_trial_score_variance(self) -> float:
-        '''
-        '''
-        if len(self._trials.trials) > 0:
-            return self._trials.best_trial["result"]["score_variance"]
-        else:
-            return np.nan
-
-    @property
-    def best_trial_pipeline(self) -> Pipeline:
-        '''
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) > 0:
-
-            return space_eval(
-                    space,
-                    {k: v[0] for k, v in
-                     self._trials.best_trial['misc']['vals'].items()
-                     if len(v) > 0})["pipeline"]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def _ith_trial_loss(self, i: int) -> float:
-        '''
-        '''
-        if len(self._trials.trials) >= i:
-            return self._trials.trials[i]['result']['loss']
-        else:
-            return np.nan
-
-    def _ith_trial_element(self, i: int, name: str) -> object:
-        '''
-        '''
-        assert(self._space_attached),\
-            "Space must be attach to be able to retrieve this information."
-
-        if len(self._trials.trials) >= i:
-            return space_eval(self._space,
-                              {k: v[0] for k, v in
-                               self._trials.trials[i]['misc']['vals']
-                               .items() if len(v) > 0})[name]
-
-    def _ith_trial_pipeline(self, i: int) -> Pipeline:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='pipeline')
-
-    def _ith_trial_name(self, i: int) -> str:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='name')
-
-    def _ith_trial_params(self, i: int) -> dict:
-        '''
-        '''
-        return self._ith_trial_element(i=i, name='params')
-
-    def _ith_trial_timestamp(self, i: int) -> datetime.datetime:
-        '''
-        '''
-        if len(self._trials.trials) >= i:
-            return self._trials.trials[i]["result"]["timestamp"]
-
-    def get_n_best_trial_pipelines(self, n: int, losses: list = None) -> list:
-        '''
-        Returns the list of n best pipelines
-        documented in trials
-        '''
-        if len(self._trials.trials) > 0:
-            if losses is None:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))]
-
-            best_n_indices = [losses.index(l)
-                              for l in sorted(list(set(losses)))[:n]]
-
-            return [self._ith_trial_pipeline(i) for i in best_n_indices]
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
-        '''
-        Returns a dictiionry where keys are pipeline names,
-        and values are lists of best pipelines with this name
-        '''
-        assert(isinstance(n, int)), "Parameter 'n' must be an integer"
-
-        if len(self._trials.trials) > 0:
-
-            best_pipelines_per_type = {}
-            names = [self._ith_trial_name(i)
-                     for i in range(len(self._trials.trials))]
-
-            for nm in names:
-                losses = [self._ith_trial_loss(i)
-                          for i in range(len(self._trials.trials))
-                          if self._ith_trial_name(i) == nm]
-
-                best_pipelines_per_type[nm] = self.get_n_best_trial_pipelines(
-                                                        n=n,
-                                                        losses=losses)
-
-            return best_pipelines_per_type
-
-        else:
-            err = ("Trials object is empty. "
-                   "Best pipeline cannot be returned")
-
-            self._logger.error(err)
-            raise Exception(err)
-
-    def write_trials_documentation(self, path: str = None):
-        '''
-        Saves an excel file with pipeline names, scores,
-        parameters, and timestamps.
-        '''
-        if len(self._trials.trials) > 0:
-            path = path or "hyperopt_trials_documentation.xlsx"
-
-            assert(isinstance(path, str)),\
-                "Parameter 'path' must be of string type"
-
-            self._assert_valid_directory(path)
-
-            names = [self._ith_trial_name(i)
-                     for i in range(len(self._trials.trials))]
-            scores = [self._score_factor*self._ith_trial_loss(i)
-                      for i in range(len(self._trials.trials))]
-            params = [self._ith_trial_params(i)
-                      for i in range(len(self._trials.trials))]
-            timestamps = [self._ith_trial_timestamp(i)
-                          for i in range(len(self._trials.trials))]
-
-        else:
-            names = []
-            scores = []
-            params = []
-            timestamps = []
-
-        pd.DataFrame({"name": names,
-                      "score": scores,
-                      "params": params,
-                      "timestamp": timestamps})\
-          .to_excel(path)
-
-
-if __name__ == '__main__':
-
-    from sklearn.metrics import roc_auc_score, make_scorer
-    from xgboost import XGBClassifier
-    from sklearn.svm import SVC
-    from sklearn.feature_selection import SelectKBest
-    from sklearn.decomposition import PCA
-    from sklearn.datasets import load_iris
-    from pprint import pprint
-
-    data = load_iris()
-    X = pd.DataFrame(data.data)
-    y = pd.Series(data.target)
-    # produce a binory variable
-    y = (y == 2).astype(int)
-    del data
-    gc.collect()
-
-    # SPACE DEFINITION ########################################
-    # (can be moved to a separate python script)
-
-    """
-    A search space must be a list of dictionaries.
-    Each dictionry must have keys:
-        name (pipeline name or type),
-        pipeline (instance of sklearn.pipeline.Pipeline),
-        params (dictionary of distributions for the parameters of
-                the pipeline that we want to tune)
-
-    Here we have a space that consists of two dictionaries:
-    KBEST_XGBOOST and PCA_SVC
-    """
-    space = []
-
-    pipeline_dist_1 = {}
-    pipeline_dist_1["name"] = "KBEST_XGBOOST"
-
-    """
-    A pipeline consists of steps (tuples).
-    Each step has a name and an algorithm.
-    This pipeline, as a first step performs
-    feature selection with SelectKBest and
-    as a second step evaluates a machine learning algo (xgboost).
-
-    Like all sklearn algorithms, a Pipeline has methods
-    fit, predict, set_params, get_params
-    """
-    pipeline_dist_1["pipeline"] = Pipeline([
-                                     ('kbest', SelectKBest()),
-                                     ('xgb', XGBClassifier())
-                                     ])
-    """
-    Pipeline parameter dictionaries must be of the form:
-    {'kbest__k': 3, xgb__n_estimators: 20},
-    each parameter name consists of the step name, __, and parameter name.
-
-    Here, instead of values, the parameter names are followed
-    by hyperopt distributions.
-    Each hyperopt distribution also must have a name,
-    due to hyperopt functionality.
-
-    Here, we set the hyperopt distribution name to the step name,
-    but it does not have to be so. Hyperopt distribution names
-    must be different for different elements of the space.
-    """
-
-    pipeline_dist_1["params"] = {
-            'kbest__k': hp.choice('kbest__k', range(1, 5)),
-
-            'xgb__n_estimators':
-            50 + hp.randint('xgb__n_estimators', 50),
-
-            "xgb__learning_rate":
-            hp.loguniform('xgb__learning_rate', np.log(0.01), np.log(0.2))
-            }
-
-    space.append(pipeline_dist_1)
-
-    pipeline_dist_2 = {}
-    pipeline_dist_2["name"] = "PCA_SVC"
-
-    pipeline_dist_2["pipeline"] = Pipeline([
-                                     ('pca', PCA()),
-                                     ('svc', SVC(gamma="scale"))
-                                     ])
-
-    pipeline_dist_2["params"] = {
-            "pca__n_components": 1 + hp.randint("pca__n_components", 4),
-
-            "svc__C": hp.loguniform("svc__C", np.log(0.01), np.log(0.1))
-            }
-
-    space.append(pipeline_dist_2)
-
-    space = hp.choice('pipelines', space)
-
-    # TESTING ##########################################################
-
-    trials_path = 'TEST_hyperopt_trials.pkl'
-
-    doc_path = 'TEST_hyperopt_doc.xlsx'
-
-    hp_obj = HyperoptPipelineSelection(cost_func=roc_auc_score,
-                                       greater_is_better=True,
-                                       trials_path=trials_path)
-
-    hp_obj.attach_data(X_train=X, y_train=y)
-
-    hp_obj.attach_space(space=space)
-
-    hp_obj.search_for_best_pipeline(niter=10)
-
-    print('\n', '='*20, 'TESTING', '='*20)
-
-    print('\n', 'Best score:', hp_obj.best_trial_score)
-
-    print('\n', 'Best score variance:', hp_obj.best_trial_score_variance)
-
-    print('\n', 'Best pipeline', hp_obj.best_trial_pipeline)
-
-    print('\n', 'Best 3 pipelines: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines(n=3))
-
-    print('\n', 'Best pipeline per type: \n')
-    pprint(hp_obj.get_n_best_trial_pipelines_of_each_type(n=1))
-
-    hp_obj.write_trials_documentation(path=doc_path)
-
-    # os.remove(doc_path)
-    # os.remove(trials_path)

+ 496 - 0
cdplib/hyperopt/HyperoptPipelineSelector.py

@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct  6 15:04:25 2020
+
+@author: tanya
+@description:a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is done in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import os
+
+import pickle
+
+from copy import deepcopy
+
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, space_eval
+
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector,\
+     SpaceElementType
+
+from typing import Callable, Optional, Literal, Dict, Union, List
+
+
+class HyperoptPipelineSelector(PipelineSelector):
+    """
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    being developed by the hyperopt creators).
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+            if None, the trials object is backed up every time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metrics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+
+        try:
+
+            super().__init__(cost_func=cost_func,
+                             greater_is_better=greater_is_better,
+                             trials_path=trials_path,
+                             backup_trials_freq=backup_trials_freq,
+                             cross_val_averaging_func=cross_val_averaging_func,
+                             additional_metrics=additional_metrics,
+                             strategy_name=strategy_name,
+                             stdout_log_level=stdout_log_level)
+
+            self._logger = Log("HyperoptPipelineSelector: ",
+                               stdout_log_level=stdout_log_level)
+
+            self._trials = self._trials or Trials()
+
+        except Exception as e:
+            err = "Failed to intialize. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    def run_trials(self,
+                   niter: int,
+                   algo: Literal[tpe.suggest, rand.suggest] = tpe.suggest)\
+            -> None:
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param algo: can only take values supported by the hyperopt library.
+            For now these are tpe.suggest for a tree-based bayesian search
+            or rand.suggest for randomized search
+        '''
+        try:
+            self._trials = self._trials or Trials()
+
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            best_trial = fmin(fn=self._objective,
+                              space=self._space,
+                              algo=algo,
+                              trials=self._trials,
+                              max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best_trial)))
+
+            self.finished_tuning = True
+
+            self.total_tuning_time = datetime.datetime.today()\
+                - self.start_tuning_time
+
+            self._backup_trials()
+
+        except Exception as e:
+            err = ("Failed to select best "
+                   "pipeline! Exit with error: {}").format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def number_of_trials(self) -> Union[int, None]:
+        """
+        :return: number of trials run so far
+            with the given Trials object
+        """
+
+        try:
+            return len(self._trials.trials)
+
+        except Exception as e:
+            err = ("Failed to retrieve the number of trials. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_space_element_from_trial(self, trial: Dict)\
+            -> Union[Dict[SpaceElementType], None]:
+        """
+        Hyperopt trials object does not contain the space
+             elements that result in the corresponding trials.
+             One has to use the function space_eval from
+             hyperopt to get the space element.
+
+        After retrieving the space element,
+            parameters of the pipeline are set.
+        """
+        try:
+            trial = deepcopy(trial)
+
+            assert(self.attached_space),\
+                "Hyperparameter space not attached."
+
+            space_element = space_eval(self._space,
+                                       {k: v[0] for k, v in
+                                        trial['misc']['vals'].items()
+                                        if len(v) > 0})
+
+            pipeline = deepcopy(space_element["pipeline"])
+            params = deepcopy(space_element["params"])
+            pipeline.set_params(**params)
+
+            space_element["pipeline"] = pipeline
+
+            return space_element
+
+        except Exception as e:
+            err = ("Failed to retrieve a space element from a trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_space_element_from_index(self, i: int)\
+            -> Union[Dict[SpaceElementType], None]:
+        """
+        Gets the space element of shape
+        {"name": NAME, "params": PARAMS, "pipeline": PIPELINE}
+        from the trial number i.
+        """
+        try:
+            assert(len(self._trials.trials) > i),\
+                ("Trials object is not long enough "
+                 "to retrieve index {}".format(i))
+
+            return self._get_space_element_from_trial(self._trials.trials[i])
+
+        except Exception as e:
+            err = ("Failed to get space element from index. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_pipeline_from_index(self, i: int) -> Union[Pipeline, None]:
+        """
+        Gets a pipeline with set parameters from the trial number i
+        """
+        try:
+            space_element = self._get_space_element_from_index(i)
+
+            return space_element["pipeline"]
+
+        except Exception as e:
+            err = ("Failed to retrieve pipeline from index. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial(self) -> Union[Dict, None]:
+        """
+        :return: dictionary with the summary of the best trial
+            and space element (name, pipeline, params)
+            resulting in the best trial
+        """
+        if len(self._trials.trials) == 0:
+
+            self._logger.log_and_throw_warning("Trials object is empty")
+            return {}
+
+        else:
+
+            try:
+                best_trial = deepcopy(self._trials.best_trial)
+
+                if self.attached_space:
+
+                    space_element = self._get_space_element_from_trial(
+                            best_trial)
+                else:
+                    space_element = {}
+
+                    warn = ("Space is not attached, "
+                            "To included the best pipeline "
+                            "attach the space")
+                    self._logger.log_and_throw_warning(warn)
+
+                best_trial = deepcopy(self._trials.best_trial["result"])
+
+                best_trial.update(space_element)
+
+                return best_trial
+
+            except Exception as e:
+                err = "Failed to retrieve best trial. Exit with error: {}"\
+                    .format(e)
+
+                self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score(self) -> Union[float, None]:
+        """
+        """
+        try:
+            if len(self.best_trial) > 0:
+                return self.best_trial["score"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial score. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score_variance(self) -> Union[float, None]:
+        """
+        """
+        try:
+            if len(self.best_trial) > 0:
+                return self.best_trial["score_variance"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial score variance. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
+        """
+        """
+        try:
+            if len(self.best_trial) > 0:
+                return self.best_trial["pipeline"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial pipeline. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
+        """
+        :return: the list of n best pipelines
+        documented in trials
+        """
+        try:
+            if len(self._trials.trials) == 0:
+                return []
+            else:
+                n_best_trials = sorted(self._trials.trials,
+                                       key=lambda x: x["result"]["score"],
+                                       reverse=True)[:n]
+
+                return [self._get_space_element_from_trial(trial)["pipeline"]
+                        for trial in n_best_trials]
+
+        except Exception as e:
+            err = ("Failed to retrieve n best pipelines. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
            -> Union[Dict[str, List[Pipeline]], None]:
        """
        :param n: number of best pipelines to keep per pipeline name
        :return: a dictionary where keys are pipeline names,
            and values are lists of the n best pipelines with this name
        """
        try:
            # score of every recorded trial, in trial order
            scores = [trial["result"]["score"]
                      for trial in self._trials.trials]

            # pipeline name of every recorded trial, in the same order
            names = [self._get_space_element_from_trial(trial)["name"]
                     for trial in self._trials.trials]

            # NOTE(review): sort_values(ascending=False) assumes a higher
            # score is better; if greater_is_better is False this would
            # keep the n worst pipelines -- TODO confirm.
            # reset_index() exposes the trial number as column "index",
            # which is then mapped back to a parametrized pipeline object.
            return pd.DataFrame({"name": names, "score": scores})\
                     .sort_values(by=["name", "score"], ascending=False)\
                     .groupby("name")\
                     .head(n)\
                     .reset_index()\
                     .assign(pipeline=lambda x: x["index"]
                             .apply(self._get_pipeline_from_index))\
                     .groupby("name")["pipeline"]\
                     .apply(lambda x: list(x))\
                     .to_dict()

        except Exception as e:
            err = ("Failed to get n best pipelines of each type. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)
+
+    def trials_to_excel(self, path: str = None) -> None:
+        """
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        """
+        try:
+            results = [trial["result"] for trial in self._trials.trials]
+
+            space_elements = [self._get_space_element_from_trial(trial)
+                              for trial in self._trials.trials]
+
+            pd.DataFrame([{**result, **space_element}
+                          for result, space_element in
+                          zip(results, space_elements)]).to_excel(path)
+
+        except Exception as e:
+            err = ("Failed to write trials to excel. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+
if __name__ == '__main__':

    # elementary example: tune pipelines on the breast-cancer data set
    # and report the best trial

    from sklearn.metrics import roc_auc_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from cdplib.log import Log
    from cdplib.db_handlers import MongodbHandler
    from cdplib.hyperopt.space_sample import space
    # from cdplib.hyperopt.composed_space_sample import space

    # temporary artifacts created by this smoke test; removed at the end
    trials_path = "hyperopt_trials_TEST.pkl"
    additional_metrics = {"precision": precision_score}
    strategy_name = "strategy_1"
    data_path = "data_TEST.h5"
    cv_path = "cv_TEST.pkl"
    collection_name = 'TEST_' + strategy_name

    logger = Log("HyperoptPipelineSelector__TEST:")

    logger.info("Start test")

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    # the selector reads its training data from an hdf5 store
    pd.DataFrame(X).to_hdf(data_path, key="X_train")
    pd.Series(y).to_hdf(data_path, key="y_train")

    # two hand-made expanding folds: first third vs rest,
    # then first two thirds vs last third
    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]

    pickle.dump(cv, open(cv_path, "wb"))

    hs = HyperoptPipelineSelector(cost_func=roc_auc_score,
                                  greater_is_better=True,
                                  trials_path=trials_path,
                                  additional_metrics=additional_metrics,
                                  strategy_name=strategy_name,
                                  stdout_log_level="WARNING")

    hs.attach_space(space=space)

    hs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
                             cv_pickle_path=cv_path)

    try:

        # TODO: this line causes a pytype to throw not-callable error
        # works fine with pytype on other class methods.
        save_method = MongodbHandler().insert_data_into_collection
        save_kwargs = {'collection_name': collection_name}

        # save_method = pd.DataFrame.to_excel()
        # save_kwargs = {'excel_writer': "TEST.xlsx"}

        # NOTE(review): 'configer_summary_saving' looks like a typo of
        # 'configure_summary_saving' -- confirm against the
        # PipelineSelector base class definition
        hs.configer_summary_saving(save_method=save_method,
                                   kwargs=save_kwargs)

        logger.info("Configured summary saving in mongo")

    except Exception as e:

        logger.warning(("Could not configure summary saving in mongo. "
                        "Exit with error: {}".format(e)))

    hs.run_trials(niter=10)

    logger.info("Best Trial: {}".format(hs.best_trial))
    logger.info("Total tuning time: {}".format(hs.total_tuning_time))

    # clean up the temporary files created above
    for file in [trials_path, data_path, cv_path]:
        os.remove(file)

    logger.info("End test")

+ 0 - 0
cdplib/hyperopt/__init__.py


+ 116 - 0
cdplib/hyperopt/composed_space_sample.py

@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul  6 14:02:24 2020
+
+@author: tanya
+@description: space object to pass to HyperoptPipelineSelection class
+"""
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRFClassifier
from hyperopt import hp

from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
+
# TODO: add sample spaces for encoders and transformers

# categorical encoders to try as the first pipeline step (none yet)
encoders = []

# feature transformers to try as an early pipeline step (none yet)
transformers = []
+
# feature selectors to try as an intermediate pipeline step; each entry
# is a dict {"name": step name used as pipeline prefix,
# "object": sklearn transformer,
# "params": hyperopt distributions keyed "<name>__<param>"}
selectors = [
    {"name": "kbest",
     "object": SelectPercentile(),
     "params": {
       "percentile": 3 + hp.randint("kbest__percentile", 60),
       "score_func": hp.choice("kbest__score_func",
                               [f_classif, chi2, mutual_info_classif])}},

    {"name": "fpr",
     "object": SelectFpr(),
     "params": {
        "score_func": hp.choice("fpr__score_func",
                                [f_classif, chi2]),
        # mutual_info_classif does not work here
        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},

    # recursive feature elimination driven by a random forest
    {"name": "rfe_rf",
     "object":
         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
     "params": {
         "n_features_to_select":
             3 + hp.randint("rfe_rf__n_features_to_select", 200),
         "estimator__n_estimators":
             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},

    # model-based selection: keep features a random forest deems important
    {"name": "rfm_rf",
     "object":
         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
                                                          random_state=33)),
     "params": {
         "estimator__n_estimators":
             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},

    # model-based selection driven by a logistic regression
    {"name": "rfm_lr",
     "object":
         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
                                                      random_state=33)),
     "params": {
          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},

    # dimensionality reduction instead of selection: scale, then PCA
    {"name": "std_scaler_pca",
     "object": Pipeline([
             ("scaler", StandardScaler()),
             ("pca", PCA(random_state=33))]),
     "params": {
        "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
       }}
    ]
+
# candidate classifiers (final pipeline step); params are hyperopt
# distributions keyed "<name>__<param>"
models = [
        {"name": "xgb",
         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
         "params": {
           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
           # hp.loguniform expects log-space bounds (it returns
           # exp(uniform(low, high))); the raw bounds (0.01, 0.5) used
           # before would have sampled learning rates in ~[1.01, 1.65]
           "learning_rate": hp.loguniform("xgb__learning_rate",
                                          np.log(0.01), np.log(0.5))
           }},

        {"name": "rf",
         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
         "params": {
           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
           "max_depth": 3 + hp.randint("rf__max_depth", 10),
           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
           }},

        # the default solver does not accept l1 penalty
        {"name": "lr",
         "object": LogisticRegression(random_state=33,
                                      solver='liblinear',
                                      # n_jobs=-1
                                      ),
         "params":  {
           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
           "C": hp.uniform("lr__C", 0.1, 1000)}},

        # svc does not support parallelizaiton, therefore is slow
        {"name": "svc",
         "object": SVC(random_state=33),
         "params": {
            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
            "degree": 2 + hp.randint("svc__degree", 3),
            "C": hp.uniform("svc__C", 0.1, 1000)
            }}
        ]
+
# steps are combined by cartesian product; empty step lists are skipped
step_list = [encoders, transformers, selectors, models]

space = SpaceComposer().compose_hyperopt_space(step_list)

+ 40 - 0
cdplib/hyperopt/space_sample.py

@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from hyperopt import hp
+import numpy as np
+
+
# sample hyperopt space: a single hp.choice over two fully defined
# candidate pipelines; "params" hold the distributions tuned per pipeline
space = hp.choice("pipelines", [

        # scale -> percentile-based feature selection -> random forest
        {"name": "std_scaler_kbest_rf",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("kbest", SelectPercentile()),
                 ("rf", RandomForestClassifier())]),
         "params": {"kbest__percentile":
                    hp.choice('kbest__percentile', range(1, 3)),
                    "rf__n_estimators":
                    50 + hp.randint('rf__n_estimators', 50)}},

        # scale -> PCA -> logistic regression
        {"name": "std_scaler_pca_lr",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("pca", PCA()),
                 ("lr", LogisticRegression())]),
         "params": {"lr__C":
                    hp.loguniform("lr__C", np.log(0.01), np.log(0.1)),
                    "pca__n_components":
                    1 + hp.randint("pca__n_components", 4)}}
        ])

+ 85 - 0
cdplib/hyperparameter_space_composer/SpaceComposer.py

@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 13:54:04 2020
+
+@author: tanya
+@description: a class that from a given list of pipeline steps
+ composes a space to be passed in the GridsearchPipelineSelector
+ or HyperoptPipelineSelector classes.
+ A classic list of steps would be: [encoders, transformers, selectors, models]
+"""
+from sklearn.pipeline import Pipeline
+from hyperopt import hp
+from itertools import product
+
+
class SpaceComposer:
    """
    Composes hyperparameter spaces for GridsearchPipelineSelector
    or HyperoptPipelineSelector from a given list of pipeline steps.
    """
    def compose_gridsearch_space(self, step_list: list) -> list:
        """
        Composes a hyperparameter space for input to the
        GridsearchPipelineSelector class.

        :param step_list: a classic list of steps would be
            [encoders, transformers, selectors, models], where each
            step is a list of dicts of the form
            {"name": step name,
             "object": sklearn estimator/transformer,
             "params": {param name: distribution or value list}}

        :return: a list of dictionaries of form
            {"name": NAME, "pipeline": PIPELINE, "params": PARAMS}
        """
        # empty steps contribute nothing to the cartesian product
        non_empty_steps = [step for step in step_list if len(step) > 0]

        space = []

        for combination in product(*non_empty_steps):

            name = "_".join(step["name"] for step in combination)

            pipeline = Pipeline([(step["name"], step["object"])
                                 for step in combination])

            # prefix every parameter with its step name, sklearn-style
            params = {}
            for step in combination:
                for param_name, param_dist in step["params"].items():
                    params[step["name"] + "__" + param_name] = param_dist

            space.append({"name": name,
                          "pipeline": pipeline,
                          "params": params})

        return space

    def compose_hyperopt_space(self, step_list: list) -> hp.choice:
        """
        Composes a hyperopt space from a list of steps
        (see compose_gridsearch_space for the expected step format):
        the gridsearch-style space is wrapped in a single hp.choice node.
        """
        return hp.choice("pipelines",
                         self.compose_gridsearch_space(step_list))

+ 12 - 11
cdplib/log.py

@@ -6,6 +6,7 @@
 import sys
 import os
 import logging
+import warnings
 from datetime import datetime
 
 sys.path.append(os.getcwd())
@@ -15,6 +16,7 @@ class Log():
     '''
     '''
     pass
+
     def __init__(self, name: str = None,
                  log_file: str = None,
                  log_level: str = "ERROR",
@@ -29,7 +31,6 @@ class Log():
             name = ''
 
         self._logger = logging.getLogger(name)
-        
 
         self._logger.setLevel("DEBUG")
 
@@ -37,7 +38,9 @@ class Log():
             self._logger.handlers.clear()
 
         if log_file is None:
-            log_file = os.path.join(".", "logs", str(datetime.today().date()) + ".log")
+            log_file = os.path.join(".",
+                                    "logs",
+                                    str(datetime.today().date()) + ".log")
 
         assert(isinstance(log_file, str)),\
             "Parameter 'log_path' must be of string type"
@@ -60,7 +63,6 @@ class Log():
 
         # self._logger.setLevel(log_level)
 
-
     @property
     def magenta(self):
         return '\033[95m'
@@ -97,7 +99,6 @@ class Log():
     def underline(self):
         return '\033[4m'
 
-
     def info(self, message: str):
         self._logger.info(message)
 
@@ -107,23 +108,23 @@ class Log():
     def error(self, message: str):
         self._logger.error(message)
 
-    def log_and_raise_error(self, message):
+    def log_and_raise_error(self, message, ErrorType=Exception):
         '''
         '''
         self._logger.error(message, exc_info=True)
 
-        raise Exception(message)
+        raise ErrorType(message)
 
-    def log_and_raise_error_stack_info(self, message):
+    def log_and_raise_error_stack_info(self, message, ErrorType=Exception):
         '''
         '''
         self._logger.error(message, exc_info=True, stack_info=True)
 
-        raise Exception(message)
+        raise ErrorType(message)
 
-    def log_and_raise_warning(self, message):
+    def log_and_throw_warning(self, message):
         '''
         '''
-        self._logger.warning(message)
+        self._logger.warning(message, exc_info=True)
 
-        raise Warning(message)
+        warnings.warn(message)

+ 208 - 0
cdplib/ml_validation/CVComposer.py

@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 10:27:39 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List, NewType
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile, chain
+
+from cdplib.log import Log
+
+
# a cv object is an iterable of (train_index, test_index) pairs
CVType = NewType("CVType", Iterable[Tuple[List]])

# NOTE: fixed typo pd.Sereis -> pd.Series (the original raised an
# AttributeError at import time)
DataSetType = NewType("DataSetType",
                      Union[pd.DataFrame, pd.Series, np.ndarray, List])


class CVComposer:
    """
    Groups methods for composing cv objects
    that follow standards from sklearn,
    these cv objects can be passed to algorithms like gridsearch, etc
    """
    def __init__(self):
        """
        """
        self._logger = Log("CVComposer: ")

    def dummy_cv(
            self,
            train_set_size: Union[int, None] = None,
            train_index: Union[pd.Series, np.ndarray, None] = None,
            test_set_size: Union[int, None] = None,
            test_index: DataSetType = None) -> CVType:
        """
        Makes a single-fold cv object [(train_index, test_index)].

        Exactly one of train_set_size/train_index must be given, and
        exactly one of test_set_size/test_index. When only sizes are
        given, consecutive integer indices are generated: the train
        indices first, then the test indices.
        """
        assert((train_index is None) != (train_set_size is None)),\
            "Set either train_index or train_set_size"

        # fixed copy-pasted assertion message (said train_* before)
        assert((test_index is None) != (test_set_size is None)),\
            "Set either test_index or test_set_size"

        if train_index is None:
            train_index = list(range(train_set_size))

        if test_index is None:
            # base the offset on the actual train index length so that an
            # explicit train_index can be combined with a test_set_size
            # (the original used train_set_size, which is None here)
            offset = len(train_index)
            test_index = list(range(offset, offset + test_set_size))

        return [(train_index, test_index)]

    def dummy_cv_and_concatenated_data_set(
            self,
            X_train: DataSetType,
            X_test: DataSetType,
            y_train: Union[DataSetType, None] = None,
            y_test: Union[DataSetType, None] = None)\
            -> Tuple[CVType, DataSetType, Union[DataSetType, None]]:
        """
        Concatenates X_train/X_test (and optionally y_train/y_test)
        and builds the matching single-fold cv object.

        NOTE: the original signature put the non-default X_test after
        the defaulted y_train, which is a SyntaxError; the parameter
        order is now (X_train, X_test, y_train, y_test).

        :return: (cv, X, y); y is None when no targets were passed.
        """
        assert((y_test is None) == (y_train is None)),\
            "Provide both y_train and y_test or neither"

        # keep the original pandas indices only if both X parts are
        # dataframes and their indices do not overlap
        # (fixed: 'and' -> '&' for the intended set intersection)
        use_index = (isinstance(X_train, pd.DataFrame) and
                     isinstance(X_test, pd.DataFrame) and
                     (len(set(X_train.index) & set(X_test.index)) == 0))

        if use_index:

            cv = self.dummy_cv(train_index=X_train.index,
                               test_index=X_test.index)

            X = pd.concat([X_train, X_test], ignore_index=False, axis=0)

        else:
            # fixed keyword names: dummy_cv expects
            # train_set_size/test_set_size, not train_size/test_size
            cv = self.dummy_cv(train_set_size=len(X_train),
                               test_set_size=len(X_test))

            X = np.concatenate([X_train, X_test])

        use_target_index = use_index and (
                    isinstance(y_train, pd.Series) and
                    isinstance(y_test, pd.Series) and
                    (X_train.index.equals(y_train.index)) and
                    (X_test.index.equals(y_test.index)))

        if use_target_index:

            y = pd.concat([y_train, y_test], ignore_index=False, axis=0)

        else:

            y = np.concatenate([y_train, y_test]) if (y_train is not None)\
                else None

        # warn when mixed or index-overlapping inputs forced the result
        # into numpy (fixed: 'and' -> '&' for the intersection)
        result_to_np = (
            (isinstance(X_train, pd.DataFrame) !=
             isinstance(X_test, pd.DataFrame)) or
            (isinstance(X_train, pd.DataFrame)) and
            (len(set(X_train.index) & set(X_test.index)) != 0))

        if result_to_np:
            self._logger.log_and_throw_warning(
                    "The concatenated dataframe is converted to numpy")

        return cv, X, y

    def expanding_cv(self, test_proportion: float,
                     start_train_proportion: float,
                     step_proportion: float = None,
                     expanding_test_size: bool = False,
                     data_set_size: Union[float, None] = None,
                     index: Union[pd.Series, np.ndarray, list, None] = None)\
            -> Union[Iterable[Tuple[List]], None]:
        """
        Yields (train_index, test_index) pairs where the train window
        always starts at the beginning of the data and grows by
        step_proportion of the data set at each fold; the test window
        directly follows the train window.

        Exactly one of index/data_set_size must be given.
        NOTE: this is a generator, so the assertion only fires on the
        first iteration, not at call time.
        """
        try:
            assert((index is None) != (data_set_size is None)),\
                "Set index or data_set_size"

            index = pd.Series(index) if (index is not None)\
                else pd.Series(range(data_set_size))

            data_set_size = data_set_size or len(index)

            start_train_size = int(start_train_proportion * data_set_size)
            step_size = int(step_proportion * data_set_size)

            test_size = int(test_proportion * data_set_size)

            # train sizes: start, start + step, start + 2*step, ...
            # while a test window of test_size still fits afterwards
            train_inds_set = (list(range(train_size))
                              for train_size in
                              takewhile(
                                      lambda x: x <= data_set_size - test_size,
                                      accumulate(repeat(start_train_size),
                                                 lambda x, _: x + step_size)))

            for train_inds in train_inds_set:

                if expanding_test_size:

                    # the test window grows with the train window
                    yield (index[train_inds],
                           index[train_inds[-1] + 1:
                                 train_inds[-1] + 1
                                 + int(test_proportion*len(train_inds))])

                else:

                    yield (index[train_inds],
                           index[train_inds[-1] + 1:
                                 train_inds[-1] + 1 + test_size])

        except Exception as e:
            self._logger.log_and_raise_error(("Failed to make expanding cv. "
                                              "Exit with error: {}".format(e)))

    def sliding_window_cv(
        self,
        test_proportion: float,
        train_proportion: float,
        step_proportion: float = None,
        data_set_size: Union[float, None] = None,
        index: Union[pd.Series, np.ndarray, list, None] = None)\
            -> Union[Iterable[Tuple[List]], None]:
        """
        Returns a cv where a fixed-size train window slides forward by
        step_proportion of the data set at each fold, each time followed
        by a fixed-size test window.

        Exactly one of index/data_set_size must be given.
        """
        try:
            assert((index is None) != (data_set_size is None)),\
                "Set index or data_set_size"

            index = pd.Series(index) if (index is not None)\
                else pd.Series(range(data_set_size))

            data_set_size = data_set_size or len(index)

            train_size = int(train_proportion * data_set_size)
            test_size = int(test_proportion * data_set_size)
            step_size = int(step_proportion * data_set_size)

            # window end positions, in lockstep with the start positions
            # below: zip(starts, sizes) yields constant-length windows
            train_sizes = takewhile(lambda x: x <= data_set_size - test_size,
                                    accumulate(repeat(train_size),
                                               lambda x, _: x + step_size))

            train_starts = takewhile(lambda x: x <= data_set_size
                                     - train_size - test_size,
                                     accumulate(repeat(step_size),
                                                lambda x, _: x + step_size))

            train_starts = chain([0], train_starts)

            train_inds_set = list(range(train_start, train_size)
                                  for train_start, train_size in
                                  zip(train_starts, train_sizes))

            cv = ((index[train_inds], index[train_inds[-1] + 1:
                                            train_inds[-1] + 1 + test_size])
                  for train_inds in train_inds_set)

            return cv

        except Exception as e:
            self._logger.log_and_raise_error(
                    ("Failed to make sliding window cv. "
                     "Exit with error: {}".format(e)))
+

+ 0 - 0
cdplib/ml_validation/__init__.py


+ 491 - 0
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 29 13:58:23 2020
+
+@author: tanya
+
+
+@description:
+
+* Input:
+    - pipeline/hyperparameter space
+    - data_train
+    - cv
+    - cv_folds
+
+* For each pipeline:
+
+    -> Split data_train into folds according to cv
+
+     -> For each fold:
+
+         => get data_train_fold, data_test_fold, cv_fold
+
+         => split data_train_fold into subfolds according to cv_fold
+
+         => For each subfold:
+
+             ==> get data_train_subfold, data_test_subfold
+
+             ==> train pipeline on data_train_subfold
+
+             ==> find best_threshold_subfold on data_test_subfold
+
+        => Find averaged_threshold_fold averaged over best_threshold_subfold
+
+        => train pipeline on data_train_fold
+
+        => find score_fold on data_test_fold with proba_threshold_fold
+
+        => find best_threshold_fold on data_test_fold
+
+    -> find score averaged over score_fold
+
+    -> find averaged_threshold averaged over best_threshold_fold
+
+* choose (pipeline/hyperparameters, threshold) in the space with best score
+
+"""
+
+import pandas as pd
+import numpy as np
+from itertools import zip_longest
+from typing import Union, Callable, Dict, Iterable, Tuple, List
+from copy import deepcopy
+from itertools import accumulate, repeat, takewhile, chain
+
+from sklearn.model_selection import StratifiedKFold
+
+from cdplib.log import Log
+
+
+
+
+
# NOTE(review): leftover interactive debugging code executed at import
# time. `make_sliding_window_cv` is neither defined nor imported in this
# module (presumably it lives elsewhere in cdplib.ml_validation), so
# importing this module raises NameError here.
# TODO: remove this section or guard it under `if __name__ == "__main__":`.
aa = make_sliding_window_cv(data_set_size=50,
                            test_proportion=0.1,
                            train_proportion=0.6,
                            step_proportion=0.1)

aa = list(aa)

aa = make_sliding_window_cv(test_proportion=0.1,
                            train_proportion=0.6,
                            step_proportion=0.05,
                            index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))

aa = list(aa)


# TODO: write with yield !!!!
+
def make_nested_expanding_cv(
        test_proportion: float,
        start_train_proportion: float,
        step_proportion: float = None,
        expanding_test_size: bool = False,
        data_set_size: Union[float, None] = None,
        index: Union[pd.Series, np.ndarray, list, None] = None)\
        -> Iterable[Tuple[List]]:
    """
    Build a nested expanding-window cross-validation: an outer expanding
    cv is created over the whole data set, and for every outer training
    window an inner expanding cv (with the same proportions) is created
    over that window alone.

    :param test_proportion: fraction of the data used for each test window
    :param start_train_proportion: fraction used for the first train window
    :param step_proportion: fraction by which the train window grows
    :param expanding_test_size: when True the test window grows with
        the train window
    :param data_set_size: number of samples; mutually exclusive with index
    :param index: optional index whose values are yielded instead of
        positions; mutually exclusive with data_set_size

    :return: list of inner cv split lists, one per outer fold
    """
    logger = Log("make_nested_expanding_cv:")

    try:
        outer_cv = make_expanding_cv(test_proportion=test_proportion,
                                     start_train_proportion=start_train_proportion,
                                     step_proportion=step_proportion,
                                     expanding_test_size=expanding_test_size,
                                     data_set_size=data_set_size,
                                     index=index)

        nested_cv = []

        for fold_train_inds, _fold_test_inds in outer_cv:

            # When an explicit index is used, the inner cv is built over
            # the outer fold's index values; otherwise over its size.
            inner_index = fold_train_inds if index is not None else None
            inner_size = None if index is not None else len(fold_train_inds)

            inner_cv = make_expanding_cv(
                    test_proportion=test_proportion,
                    start_train_proportion=start_train_proportion,
                    step_proportion=step_proportion,
                    expanding_test_size=expanding_test_size,
                    data_set_size=inner_size,
                    index=inner_index)

            nested_cv.append(list(inner_cv))

        return nested_cv

    except Exception as e:
        logger.log_and_raise_error(("Failed to make nested expanding cv. "
                                    "Exit with error: {}".format(e)))
+
+
+
+
# NOTE(review): more leftover debug code executed at import time.
# `aa` comes from the debug calls above; `aaa` is used on the
# `aaa = list(aaa)` line below before it is ever assigned (NameError).
# TODO: delete this section.
for train_inds, test_inds in aa:
    print(len(test_inds)/(len(train_inds) + len(test_inds)))
    print(len(test_inds)/50)

aaa = list(aaa)

for aaa_cv in aaa:
    for train_inds, test_inds in aaa_cv:
        print(len(test_inds)/(len(train_inds) + len(test_inds)))
        print(len(test_inds)/50)

aaa = make_nested_expanding_cv(#data_set_size=50,
                               test_proportion=0.1,
                               start_train_proportion=0.6,
                               step_proportion=0.1,
                               index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))

aaa = list(aaa)
+
+
+
+
+
def cv_slice_dataset(X: Union[pd.DataFrame, np.ndarray],
                     y: Union[pd.Series, np.ndarray, None],
                     train_inds: Union[List, np.ndarray],
                     test_inds: Union[List, np.ndarray])\
        -> Tuple[Union[pd.DataFrame, np.ndarray],
                 Union[pd.DataFrame, np.ndarray],
                 Union[pd.Series, np.ndarray, None],
                 Union[pd.Series, np.ndarray, None]]:
    """
    Slice features and targets into train and validation parts.

    :param X: features; a DataFrame is sliced by label via .loc,
        anything else by positional/fancy indexing.
    :param y: optional targets (None for unsupervised problems).
    :param train_inds: row indices of the training part.
    :param test_inds: row indices of the validation part.

    :return: (X_train, X_val, y_train, y_val); the target slices
        are None when y is None.
    """
    if isinstance(X, pd.DataFrame):
        X_train = X.loc[train_inds]
        X_val = X.loc[test_inds]
    else:
        X_train = X[train_inds]
        X_val = X[test_inds]

    # BUG FIX: previously y_train/y_val were left undefined when y was
    # None, so the return statement raised NameError.
    y_train, y_val = None, None

    if y is not None:
        y_train = y[train_inds]
        y_val = y[test_inds]

    return X_train, X_val, y_train, y_val
+
+
def get_optimal_proba_threshold(score_func: Callable,
                                y_true: Union[pd.Series, np.ndarray],
                                proba: Union[pd.Series, np.ndarray],
                                threshold_set: Union[Iterable, None] = None):
    """
    Grid-search the probability threshold that maximizes score_func.

    :param score_func: callable of (y_true, y_pred) returning a score.
    :param y_true: true binary labels.
    :param proba: predicted probabilities of the positive class.
    :param threshold_set: candidate thresholds; defaults to
        0.0, 0.1, ..., 0.9.

    :return: the first threshold in threshold_set achieving the
        maximal score.
    """
    if threshold_set is None:
        threshold_set = np.arange(0, 1, 0.1)

    # Score every candidate threshold by binarizing proba at it.
    scores = {
        threshold: score_func(y_true, (proba >= threshold).astype(int))
        for threshold in threshold_set}

    return max(scores, key=scores.get)
+
+
def cross_validate_with_optimal_threshold(
        estimator: object,
        score_func: Callable,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[pd.Series, np.ndarray, None] = None,
        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val: Union[pd.Series, np.ndarray, None] = None,
        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
        cv: Union[Iterable, int, None] = None,
        cv_threshold: Union[Iterable, int, None] = None,
        additional_metrics: Union[Dict[str, Callable], None] = None,
        threshold_set: Union[Iterable, None] = None,
        scores: Dict = None)\
            -> Dict:
    """
    Cross-validate a binary classifier while tuning the probability
    threshold that converts predicted probabilities into labels.

    Two modes:
    * cv is None: the test score is computed on (X_val, y_val); the
      threshold is tuned either on (X_val_threshold, y_val_threshold),
      on X_val itself, or via the inner cv_threshold folds.
    * cv is given (int or iterable of index pairs): the function
      recurses into each outer fold with cv=None, accumulating all
      results into the shared `scores` dict.

    :param estimator: object with fit/predict_proba (sklearn API).
    :param score_func: callable of (y_true, y_pred) to maximize when
        choosing the threshold; also used as train/test score.
    :param cv: outer cross-validation (int = StratifiedKFold folds).
    :param cv_threshold: inner cross-validation for threshold tuning.
    :param additional_metrics: extra metrics recorded per fold as
        {"name": callable}.
    :param threshold_set: candidate thresholds.
        NOTE(review): currently not forwarded to
        get_optimal_proba_threshold below — confirm intent.
    :param scores: accumulator dict used by the recursive calls.

    :return: dict with lists "test_threshold", "test_score",
        "train_score" and "train_/test_<metric>" per additional metric.
    """
    logger = Log("cross_validate_with_optimal_threshold:")

    # Deep-copy all inputs so fitting/recursion cannot mutate the
    # caller's data.
    X_train = deepcopy(X_train)
    y_train = deepcopy(y_train)
    X_val = deepcopy(X_val)
    y_val = deepcopy(y_val)
    X_val_threshold = deepcopy(X_val_threshold)
    y_val_threshold = deepcopy(y_val_threshold)

    # Shared accumulator: recursive calls append one entry per fold.
    scores = scores or {"test_threshold": [],
                        "test_score": [],
                        "train_score": []}

    additional_metrics = additional_metrics or {}

    for metric_name, metric in additional_metrics.items():
        if "test_" + metric_name not in scores:
            scores["test_" + metric_name] = []
            scores["train_" + metric_name] = []

    if cv is None:

        # test score is calculated on X_vals

        assert((X_val is not None) and (y_val is not None)),\
            "Validation set must be set"

        if cv_threshold is None:

            # Refit only when a dedicated threshold-tuning set exists;
            # otherwise the estimator trained inside the (dummy) fold
            # below is reused for scoring.
            refit = (X_val_threshold is not None)

            # if a validation set for proba threshold tuning is not given,
            # we use the validation set on which we calculate the test score
            # (this might lead to overfitting)

            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)

            # NOTE(review): make_dummy_cv is not defined in this module
            # — presumably imported elsewhere; it appears to return a
            # one-split cv plus the concatenated train data. Confirm.
            cv_threshold, X_train, y_train = make_dummy_cv(
                    X_train=X_train,
                    y_train=y_train,
                    X_val=X_val_threshold,
                    y_val=y_val_threshold)
        else:

            # if cv_threshold is given, we find the optimal threshold
            # on each fold and output the average value for the threshold

            if (X_val_threshold is not None):
                logger.log_and_throw_warning((
                        "X_val_threshold is set "
                        "but cv_threshold will be used"))

            if isinstance(cv_threshold, int):
                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
                    .split(X=X_train, y=y_train)

            refit = True

        thresholds = []

        # Tune the threshold on every inner fold.
        for train_inds, val_inds in cv_threshold:

            print("----- In cv threshold fold")

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                cv_slice_dataset(X=X_train,
                                 y=y_train,
                                 train_inds=train_inds,
                                 test_inds=val_inds)

            estimator.fit(X_train_fold, y_train_fold)

            proba_val = estimator.predict_proba(X_val_fold)[:, 1]

            threshold = get_optimal_proba_threshold(score_func=score_func,
                                                    y_true=y_val_fold,
                                                    proba=proba_val)

            thresholds.append(threshold)

            print("----- Threshold:", threshold)

        # The recorded threshold is the fold average ...
        scores["test_threshold"].append(np.mean(thresholds))

        if refit:

            estimator.fit(X_train, y_train)

            proba_val = estimator.predict_proba(X_val)[:, 1]

        proba_train = estimator.predict_proba(X_train)[:, 1]

        # ... NOTE(review): but scoring below uses `threshold` from the
        # LAST inner fold, not the recorded mean — confirm this is
        # intentional.
        pred_train = (proba_train >= threshold)
        pred_val = (proba_val >= threshold)

        train_score = score_func(y_train, pred_train)
        test_score = score_func(y_val, pred_val)

        for metric_name, metric in additional_metrics.items():
            scores["train_" + metric_name].append(metric(y_train, pred_train))
            scores["test_" + metric_name].append(metric(y_val, pred_val))

        scores["train_score"].append(train_score)
        scores["test_score"].append(test_score)

        return scores

    else:

        if isinstance(cv, int):
            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)

        cv_threshold = cv_threshold or []

        # zip_longest pads missing inner cvs with None (-> the recursive
        # call falls back to the cv_threshold=None branch).
        # NOTE(review): if cv_threshold is LONGER than cv, the padded
        # (train_inds, val_inds) pair is None and unpacking fails.
        for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):

            print("=== In cv fold")

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                cv_slice_dataset(X=X_train,
                                 y=y_train,
                                 train_inds=train_inds,
                                 test_inds=val_inds)

            # Recurse with cv=None: one outer fold at a time.
            scores = cross_validate_with_optimal_threshold(
                    estimator=estimator,
                    score_func=score_func,
                    X_train=X_train_fold,
                    y_train=y_train_fold,
                    X_val=X_val_fold,
                    y_val=y_val_fold,
                    cv_threshold=cv_fold,
                    additional_metrics=additional_metrics,
                    threshold_set=threshold_set,
                    scores=scores)

            print("=== scores:", scores)

        return scores
+
+
if __name__ == "__main__":

    # Smoke tests for cross_validate_with_optimal_threshold covering the
    # supported combinations of cv / cv_threshold arguments.
    # NOTE(review): requires xgboost, which is not imported at module level.

    from sklearn.metrics import accuracy_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from xgboost import XGBRFClassifier
    from sklearn.model_selection import train_test_split

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    X_train, X_val, y_train, y_val = train_test_split(X, y)

    estimator = XGBRFClassifier()

    score_func = accuracy_score

    additional_metrics = {"precision": precision_score}

    averaged_scores = []
    averaged_thresholds = []

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")

    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=None,
            y_val_threshold=None,
            cv=None,
            cv_threshold=None,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # NOTE(review): X_train/y_train are overwritten here, so the
    # remaining cases run on a smaller training set (see TODO below).
    X_train, X_val_threshold, y_train, y_val_threshold =\
        train_test_split(X_train, y_train)

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")

    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=None,
            cv_threshold=None,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    print("\nTesting cv=None, cv_threshold=3 \n")

    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=None,
            cv_threshold=3,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=None \n")

    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=3,
            cv_threshold=None,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    # NOTE(review): unlike the other cases, this one appends nothing to
    # averaged_scores / averaged_thresholds.

    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")

    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=3,
            cv_threshold=[3, 3, 3],
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # TODO: check overwriting X_train,
    # additional metrics append instead of overwrite
    # check the length of cv_threshold
    # test custom cv, cv_threshold

    print("\n Averaged test score:", averaged_scores)
    print("\n Averaged threshold:", averaged_thresholds)

+ 97 - 0
cdplib/ml_validation/expanding_cv.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 09:55:52 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile
+
+from cdplib.log import Log
+
+
def make_expanding_cv(test_proportion: float,
                      start_train_proportion: float,
                      step_proportion: float = None,
                      expanding_test_size: bool = False,
                      data_set_size: Union[float, None] = None,
                      index: Union[pd.Series, np.ndarray, list, None] = None)\
        -> Union[Iterable[Tuple[List]], None]:
    """
    Generate (train_index, test_index) pairs for an expanding-window
    cross-validation: each split trains on a growing prefix of the data
    and tests on the rows immediately following it.

    :param test_proportion: fraction of the data set used for each test
        window (relative to the current train window when
        expanding_test_size is True).
    :param start_train_proportion: fraction of the data set used for the
        first training window.
    :param step_proportion: fraction of the data set by which the train
        window grows per split; must be provided (the default None
        raises inside the generator).
    :param data_set_size: number of samples; mutually exclusive with index.
    :param index: optional index (e.g. DatetimeIndex) whose values are
        yielded instead of positional row numbers; mutually exclusive
        with data_set_size.
    """
    logger = Log("make_expanding_cv:")

    try:
        assert((index is None) != (data_set_size is None)),\
            "Set index or data_set_size"

        if index is None:
            index = pd.Series(range(data_set_size))
        elif isinstance(index, list):
            # BUG FIX: a plain python list does not support fancy
            # indexing (index[train_inds] below raised TypeError);
            # wrap it in a Series, consistent with make_sliding_window_cv.
            index = pd.Series(index)

        data_set_size = data_set_size or len(index)

        start_train_size = int(start_train_proportion * data_set_size)
        step_size = int(step_proportion * data_set_size)

        test_size = int(test_proportion * data_set_size)

        # Train windows: start_train_size, +step_size, ... for as long
        # as a test window still fits after them.
        train_inds_set = (list(range(train_size))
                          for train_size in
                          takewhile(
                                  lambda x: x <= data_set_size - test_size,
                                  accumulate(repeat(start_train_size),
                                             lambda x, _: x + step_size)))

        for train_inds in train_inds_set:

            if expanding_test_size:

                yield (index[train_inds],
                       index[train_inds[-1] + 1:
                             train_inds[-1] + 1
                             + int(test_proportion*len(train_inds))])

            else:

                yield (index[train_inds],
                       index[train_inds[-1] + 1:
                             train_inds[-1] + 1 + test_size])

    except Exception as e:
        logger.log_and_raise_error(("Failed to make expanding cv. "
                                    "Exit with error: {}".format(e)))
+
+
if __name__ == "__main__":

    # Smoke tests: expanding cv built from a data set size and from a
    # datetime index (no assertions, just checks the generators run).

    logger = Log("Test_expanding_cv: ")

    logger.info("Start Testing")

    logger.info("Testing expanding cv: ")

    cv = make_expanding_cv(data_set_size=50,
                           test_proportion=0.1,
                           start_train_proportion=0.6,
                           step_proportion=0.1,
                           expanding_test_size=True)

    cv = list(cv)

    logger.info("Testing expanding cv with datetime index")

    cv = make_expanding_cv(
            test_proportion=0.1,
            start_train_proportion=0.6,
            step_proportion=0.1,
            index=pd.date_range(start=pd.to_datetime("2020-01-01"),
                                periods=50))

    cv = list(cv)

    logger.info("Finish testing")

+ 789 - 0
cdplib/pipeline_selector/PipelineSelector.py

@@ -0,0 +1,789 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:23:23 2020
+
+@author: tanya
+@description: an abstract class for selecting a machine learning
+ pipeline from a space (deterministic or random) of parameter distributions
+ over multiple pipelines.
+ The selection is thought in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far
+ as well as the entire tuning history if needed.
+ Methods configure_cross_validation and configure_result_saving
+ allow to use a custom cross-validation method and
+ save the current best result in a file or database during training.
+ Children classes: hyperopt and custom gridsearch.
+"""
+
+import pickle
+import os
+import sys
+import time
+import datetime
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+from abc import ABC, abstractmethod, abstractproperty
+from typing import Callable, Optional, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
+import functools
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
+from sklearn.metrics import make_scorer
+from hyperopt import STATUS_OK, STATUS_FAIL
+from cdplib.log import Log
+from cdplib.utils import ExceptionsHandler
+from cdplib.utils import LoadingUtils
+from cdplib.ml_validation import CVComposer
+
+sys.path.append(os.getcwd())
+
+
class SpaceElementType(TypedDict):
    """
    One element of the pipeline/hyperparameter search space:
    a named sklearn pipeline together with its parameter dict.
    """
    name: str
    pipeline: Pipeline
    params: dict
+
+
+class PipelineSelector(ABC):
+    """
+    An abstract class for selecting a machine learning
+    pipeline from a space (deterministic or random) of parameter
+    distributions over multiple pipelines.
    The selection is thought in such a way that a Trials object is being
+    maintained during the tuning process from which one can retrieve
+    the best pipeline so far as well as the entire tuning history
+    if needed.
+    Methods configure_cross_validation and configure_result_saving
+    allow to use a custom cross-validation method and
+    save the current best result in a file or database during training.
+    Children classes: hyperopt and custom gridsearch.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 additional_averaging_funcs:
+                     Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequecy in interations (trials)
+            of saving the trials object at the trials_path.
+            if None, the trials object is backed up avery time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores of the cost_func.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metics: dict of additional metrics to keep track of
+            in the trials of the form {"metric_name": metric}.
+
+        :param additional_averaging_funcs: functions used to aggregate
+            the output of the cross_validate function.
+            The output always contains the scores of the cost_func,
+            additional_metrics (if it is not empty),
+            but it can also contain additional information
+            (like probability threshold for example)
+            if different from cross_val_averaging_func.
+            Of the form {"metric_name": averaging_func}
+
+            Remark:
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
+
+        try:
+
+            ExceptionsHandler(self._logger)\
+                .assert_is_directory(path=trials_path)
+
+            self.attached_space = False
+            self.attached_data = False
+            self.configured_cross_validation = False
+            self.configured_summary_saving = False
+
+            self._cost_func = cost_func
+            # score factor is 1 when cost_func is minimized,
+            # -1 when cost func is maximized
+            self._score_factor = (not greater_is_better) - greater_is_better
+            self.trials_path = trials_path
+            self._backup_trials_freq = backup_trials_freq
+            self._strategy_name = strategy_name
+            self._data_path = None
+            self._cv_path = None
+
+            self._X = None
+            self._y = None
+            self._cv = None
+            self._space = None
+
+            # if cross-valition is not configured,
+            # sklearn cross-validation method is taken by default
+            self._cross_validation = sklearn_cross_validation
+
+            # if a trials object already exists at the given path,
+            # it is loaded and the search is continued. Else,
+            # the search is started from the beginning.
+            if os.path.isfile(self.trials_path):
+
+                with open(self.trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                self._start_iteration = self.number_of_trials
+
+                self.best_score = self.best_trial_score
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(self._start_iteration))
+
+            else:
+                self._logger.warning(("No existing trials object was found, "
+                                      "Starting from scratch."))
+
+                self._trials = None
+                self._start_iteration = 0
+                self.best_score = np.nan
+
+            # keeping track of the current search iteration
+            self._iteration = self._start_iteration
+            self._score_improved = False
+
+            self.start_tuning_time = datetime.datetime.today()
+            self.total_tuning_time = None
+            self.finished_tuning = False
+
+        except Exception as e:
+            err = ("Failed to initialize the class. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _backup_trials(self) -> None:
+        '''
+        Pickles (Saves) the trials object in binary format.
+        '''
+        try:
+            with open(self.trials_path, "wb") as f:
+                pickle.dump(self._trials, f)
+
+        except Exception as e:
+            err = "Could not backup trials. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation(self,
+                                   cross_validation: Callable,
+                                   kwargs: dict = None) -> None:
+        """
+        Method for attaching a custom cross-validation function
+
+        :param cross_validation: a function that has the same
+             signature as sklearn.model_selection.cross_validate
+        """
+        try:
+            kwargs = kwargs or {}
+
+            self._cross_validation = functools.partial(
+                    self._cross_validation, **kwargs)
+
+            self.configured_cross_validation = True
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to configure cross-validation. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation_from_module(self,
+                                               module_path: str,
+                                               name: str) -> None:
+        """
+        Attaches a cross-validation funciton defined in
+        a different python model. This function must have
+        the same signature as sklearn.model_seclection.cross_validate
+
+        :param str module_path: path to python module
+            where the cross_validation function is defined.
+
+        :param str name: name of the cross validation function
+            loaded froma python module.
+        """
+        try:
+            self._cross_validation = \
+                LoadingUtils().load_from_module(
+                        module_path=module_path, name=name)
+
+            self.configured_cross_validation = True
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to load cross-validation from module. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_space(self, space) -> None:
+        """
+        Method for attaching the pipeline/hyperparameter space
+        over which the score_func is optimized.
+
+        :param space: space where
+            the search is performed. A space might be either
+            a list of dictionaries or a hyperopt space object
+            the elements of which are dictionaries with keys:
+            name, pipeline, params
+        """
+        try:
+            self._space = space
+
+            self.attached_space = True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            err = ("Failed to attach space. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_space_from_module(self, module_path: str, name: str) -> None:
+        """
+        Attaches a space defined in a different python module.
+
+        :param str module_path: path to python module
+            where the space is defined.
+
+        :param str name: name of the space loaded from
+            a python module.
+        """
+        try:
+            self._space = LoadingUtils().load_from_module(
+                    module_path=module_path, name=name)
+
+            self.attached_space = True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            err = ("Failed to attach space from module. "
+                   "Exit with error {}".format(e))
+
+            self._logger.loger_and_raise_error(err)
+
+    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
+                    y_train: Optional[pd.DataFrame, pd.Series, np.ndarray]
+                    = None,
+                    X_val: Optional[pd.DataFrame, np.ndarray]
+                    = None,
+                    y_val: Optional[pd.DataFrame, pd.Series, np.ndarray]
+                    = None,
+                    cv: Optional[Iterable[Tuple[List[int], List[int]]]]
+                    = None) -> None:
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (None in case of unsupervided learning)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: iterabe of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+        try:
+            assert((cv is None) == (X_val is not None)),\
+                "Either cv or X_val must be provided"
+
+            if cv is None:
+
+                assert((y_val is None) == (y_train is None)),\
+                    "y_train and y_val must be simultanious"
+
+                # Here we create a trivial cv object
+                # with one validation split.
+                cv = CVComposer.dummy_cv()
+
+
+
+
+
+                train_inds = list(range(len(X_train)))
+                val_inds = list(range(len(X_train),
+                                      len(X_train) + len(X_val)))
+
+                self._cv = [(train_inds, val_inds)]
+
+                self._X = np.concatenate([X_train, X_val])
+                self._y = None if y_train is None\
+                    else np.concatenate([y_train, y_val])
+
+            else:
+
+                self._cv = cv
+                self._X = X_train
+                self._y = y_train
+
+            self.attached_data = True
+
+            self._logger.info("Attached data")
+
+        except Exception as e:
+            err = ("Failed to attach data. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_data_from_hdf5(self,
+                              data_hdf5_store_path: str,
+                              cv_pickle_path: str = None) -> None:
+        """
+        Method for attaching data from a hdf5 store
+         and a cv object from a pickled file.
+
+         The hdf5 store is a binary file,
+         after loading it, it is a dictionary with keys
+         X_train (y_train, X_val, y_val).
+
+         The cv is loaded from a pickle file.
+
+         The reason to separate the data
+         store from the cv store, is the hdf5 is optimized to
+         store large dataframes (especially with simple types) and
+         a a small list of lists like a cv-object is better
+         to be stored as a pickle file.
+
+        :param str data_hdf5_store_path: path to the hdf5 store
+            with train and validation data
+        :param str cv_pickle_path: path to the pickle file with
+            the cv data
+        """
+        try:
+            assert(os.path.isfile(data_hdf5_store_path)),\
+                "Parameter hdf5_store_path is not a file"
+
+            store = pd.HDFStore(data_hdf5_store_path)
+
+            self._data_path = data_hdf5_store_path
+
+            data_input = {key: store["key"] if key in store else None
+                          for key in ["X_train", "y_train", "X_val", "y_val"]}
+
+            if cv_pickle_path is not None:
+
+                assert(os.path.isfile(cv_pickle_path)),\
+                    "Parameter cv_pickle_path is not a file"
+
+                data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
+
+                self._cv_path = cv_pickle_path
+
+            else:
+                data_input["cv"] = None
+
+            self.attach_data(**data_input)
+
+            store.close()
+
+        except Exception as e:
+            err = "Failed to attach data. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def default_summary(self) -> dict:
+        """
+        Default summary of the strategy.
+        Every the _objective function is called
+        the current score and the information
+        about the tested space element is added to the
+        summary and it is saved to the Trials.
+        If summary saving is configured it is also
+        saved to a file, or a database when the score improves.
+        """
+        summary = {}
+
+        if self._strategy_name is not None:
+            summary["strategy_name"] = self._strategy_name
+
+        if isinstance(self._cost_func, str):
+            summary["cost_func"] = self._cost_func
+
+        elif hasattr(self._cost_func, "__name__"):
+            summary["cost_func"] = self._cost_func.__name__
+
+        summary["trials_path"] = self.trials_path
+
+        if self._data_path is not None:
+            summary["data_path"] = self._data_path
+
+        if self._cv_path is not None:
+            summary["cv_path"] = self._cv_path
+
+        summary["start_tuning_time"] = self.start_tuning_time
+
+        summary["iteration"] = self._iteration
+
+        return summary
+
+    def configer_summary_saving(self,
+                                save_method: Callable
+                                = functools.partial(
+                                        pd.DataFrame.to_excel,
+                                        **{"path_or_buf": "result.csv"}),
+                                kwargs: Optional[dict] = None) -> None:
+        """
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
+
+        :param Callable save_method: method for saving the result
+            of the pipeline selection. The method must accept
+            a pandas DataFrame as argument.
+            By default, saving to an excel file.
+
+            Examples:
+                functools.partial(pd.DataFrame.to_csv,
+                                  **{"path_or_buf": <PATH>})
+                functools.partial(np.savetxt, **{"fname": <PATH>})
+
+                functools.partial(SQLHandler(<URI>).append_to_table,
+                                  **{"tablename": <NAME>})
+
+                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
+                                  **{"collection_name": <NAME>})
+
+            using functools can be avoided by providing the kwarg argument
+
+        :param dict kwargs: a dictionary with keyword arguments
+            (like tablename) to provide to the save_method
+        """
+        try:
+            kwargs = kwargs or {}
+
+            self._save_method = functools.partial(save_method, **kwargs)
+
+            self.configured_summary_saving = True
+
+            self._logger.info("Configured summary saving")
+
+        except Exception as e:
+            err = ("Failed to configure the summary saving. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _save_summary(self, summary: dict) -> None:
+        """
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
+        """
+        try:
+            assert(self.configured_summary_saving),\
+                "Result saving must be configured first"
+
+            self._save_method(summary)
+
+        except Exception as e:
+            err = ("Could not configure summary saving. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
+        """
+        Calculates the averaged cross-validated score and score variance,
+        as well as the averaged values and variances of the additional metrics.
+
+        This method is called in the _objective function that is
+        passed to the hyperopt optimizer.
+
+        This function can be overriden, when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :return: dictionary with the aggregated
+            cross-validation scores and
+            the score variances for the scores in the output
+            of the cross-validation function.
+
+            form of the output:
+                {"score": 10, #score used in optimization,
+                 "score_variance": 0.5
+                 "additional_metric1": 5,
+                 "additional_metric1_variance": 7}
+
+            a custom cross-validation function can also include for
+            example probability threshold for each fold, then
+            the output of this function will include the average
+            value and the variance of the probability threshold
+            over the folds.
+        """
+        try:
+            scoring = {"score": make_scorer(self.cost_func)}
+
+            scoring.update({metric_name: make_scorer(metric)
+                            for metric_name, metric
+                            in self._additional_metrics.items()})
+
+            scores = self._cross_validation(
+                    estimator=pipeline,
+                    X=self._X,
+                    y=self._y,
+                    cv=self._cv,
+                    scoring=self._scoring,
+                    error_score=np.nan)
+
+            averaging_funcs = {
+                    metric_name: self._additional_averaging_funcs[metric_name]
+                    if metric_name in self._additional_averaging_funcs
+                    else self._cross_val_averaging_func
+                    for metric_name in scores}
+
+            scores_average = {
+                    metric_name.replace("test_", ""):
+                    averaging_funcs[metric_name](scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            scores_variance = {
+                    metric_name.replace("test_", "") + "_variance":
+                    np.var(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            return {**scores_average, **scores_variance}
+
+        except Exception as e:
+            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    def _objective(self, space_element: SpaceElementType) -> dict:
+        '''
+        This method is called in run_trials method
+        that is using the hyperopt fmin opmizer.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it is multiplied by -1 to obtain loss.
+
+        :param SpaceElementType space_element: element
+            of the space over which the optimization is done
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            uderstood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+            and other keys given in self.default_summary
+        '''
+        try:
+            start_time = time.time()
+
+            assert(self.attached_data),\
+                ("Data must be attached in order "
+                 "in order to effectuate the best"
+                 "pipeline search")
+
+            summary = deepcopy(self.default_summary)
+
+            # backup the current trials if the score improved
+            # at previous iteration or every ith iteration
+            # if the backup_trials_freq is set
+            backup_cond = ((self._backup_trials_freq is not None) and
+                           ((self._iteration - self._start_iteration - 1) %
+                            self._backup_trials_freq == 0)) or\
+                self._score_improved
+
+            if backup_cond:
+                self._backup_trials()
+                self._score_improved = False
+
+            pipeline = space_element['pipeline']
+            params = space_element['params']
+            pipeline.set_params(**params)
+
+            self._logger.info(("Iteration {0}: "
+                               "Current score is {1}: "
+                               "Training pipeline {2} "
+                               "with parameters: {3}. ").format(
+                                  self._iteration,
+                                  self.best_score,
+                                  space_element['name'],
+                                  params))
+
+            result = self._evaluate(pipeline)
+
+            summary.update(result)
+
+            end_time = time.time()
+
+            summary['status'] = STATUS_OK
+            summary.update(result)
+            summary['loss'] = self._score_factor * summary['score']
+            summary['timestamp'] = datetime.datetime.today()
+            summary['train_time'] = end_time - start_time
+
+            self._iteration += 1
+
+            self._score_improved = (self.best_score != self.best_score) or\
+                                   (self._score_factor*result["score"] <
+                                    self._score_factor*self.best_score)
+
+            if self._score_improved:
+
+                self._logger.info("Score improved, new best score is: {}"
+                                  .format(result["score"]))
+
+                self.best_score = result['score']
+
+                if self.configured_summary_saving:
+                    self._save_summary(summary)
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            summary = {}
+            summary['status'] = STATUS_FAIL
+            summary['timestamp'] = datetime.datetime.today()
+            summary['error'] = e
+            for key in ['loss', 'score', 'score_variance', 'train_time']:
+                summary[key] = np.nan
+
+        return summary
+
    @abstractmethod
    def run_trials(self):
        """
        Method that runs the hyperparameter tuning over possibly multiple
        pipeline types specified in self.space.

        When the run_trials method is finished, the flag
        self.finished_tuning should be set to True and the methods
        self._backup_trials and optionally self._save_result
        should be called.
        """
        pass
+
+    @abstractproperty
+    def number_of_trials(self) -> int:
+        """
+        Number of trials already run in the current trials object
+        """
+        pass
+
+    @abstractproperty
+    def best_trial(self) -> dict:
+        """
+        Best trial sor far.
+         Should contain the status, pipeline,
+         hyperparameters, and the score (loss).
+         Other information is otional and is defined
+         by self.default_summary
+        """
+        pass
+
+    @abstractproperty
+    def best_trial_score(self) -> float:
+        """
+        Score of the best pipeline with the best hyperparameters
+        """
+        pass
+
+    @abstractproperty
+    def best_trial_score_variance(self) -> float:
+        """
+        Variance of the cross-validation score of the best pipeline
+        """
+        pass
+
+    @abstractproperty
+    def best_trial_pipeline(self) -> Pipeline:
+        """
+        Best pipeline with best hyperparameters
+        """
+        pass
+
    @abstractmethod
    def get_n_best_trial_pipelines(self, n: int) -> list:
        """
        N best pipelines with corresponding
        best hyperparameters.

        :param int n: number of pipelines to return
        """
        pass
+
    @abstractmethod
    def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline-types
        with corresponding hyperparameters.

        :param n_int: number of pipelines per pipeline type
        """
        # NOTE(review): the parameter name "n_int" looks like a typo
        # for "n: int" — confirm against the subclasses before renaming,
        # since the name is part of the abstract interface.
        pass
+
    @abstractmethod
    def trials_to_excel(self, path: str) -> None:
        """
        Trials object in the shape of a table written to excel;
        should contain the iteration, pipeline (as str),
        hyperparameters (as str), self.best_result
        (see self._objective method)
        as well as additional information defined by
        self.default_summary.

        :param str path: path of the output excel file
        """
        pass

+ 0 - 0
cdplib/unit_tests/TestFlattenData.py


+ 0 - 0
cdplib/unit_tests/TestLog.py


+ 0 - 0
cdplib/unit_tests/TestMongodbHandler.py


+ 0 - 0
cdplib/unit_tests/invalid_test_schema.json


+ 0 - 0
cdplib/unit_tests/valid_test_schema.json


+ 21 - 10
cdplib/utils/CleaningUtils.py

@@ -8,13 +8,16 @@ Created on Fri Sep 27 16:20:03 2019
 
 import pandas as pd
 import numpy as np
+from typing import Union, List
 
 
 class CleaningUtils:
     '''
     Unites different methods for data cleaning
     '''
-    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
+    def convert_dates(self,
+                      series: pd.Series,
+                      formats: Union[str, List[str]]) -> pd.Series:
         '''
         Converts values from string to date in a pandas Series
          where possibly multiple date formats are mixed
@@ -29,8 +32,7 @@ class CleaningUtils:
 
                 series = series.astype(str)
 
-                series.loc[missing_leading_zero] = "0" +\
-                    series.loc[missing_leading_zero]
+                series.loc[missing_leading_zero] += "0"
 
             converted_this_format = pd.to_datetime(series,
                                                    format=formt,
@@ -71,21 +73,28 @@ class CleaningUtils:
 
         return s
 
-    def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
+    def melt_duplicated_columns(self, df: pd.DataFrame,
+                                suffix: str = "",
+                                prefix: str = "") -> pd.DataFrame:
         '''
         If a dataframe has multiple columns with the same name
          (up to a prefix or a suffix),
          melts the columns together in one
 
-        :parame suffix: string or regex up to which we consider names as duplicated
-        :parame prefix: string or regex up to which we consider names as duplicated
+        :parame suffix: string or regex up
+            to which we consider names as duplicated
+        :parame prefix: string or rege
+            up to which we consider names as duplicated
         '''
         from collections import Counter
 
         import re
 
-        # remove the suffix and the prefix from the column names (now the duplicates are truely duplicates)
-        df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
+        # remove the suffix and the prefix from the column names
+        # (now the duplicates are truely duplicates)
+        df.columns = [re.sub(re.compile(prefix), "",
+                             re.sub(re.compile(suffix), "", c))
+                      for c in df.columns]
 
         column_counter = Counter(df.columns)
 
@@ -100,10 +109,12 @@ class CleaningUtils:
             df_melted = []
 
             for dup_var in dup_vars:
-                dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
+                dup_var_melted = pd.melt(frame=df,
+                                         id_vars=id_vars,
+                                         value_vars=[dup_var],
+                                         value_name=dup_var)\
                                    .set_index(id_vars)[dup_var]
 
                 df_melted.append(dup_var_melted)
 
             return pd.concat(df_melted, axis=1, sort=False).reset_index()
-

+ 28 - 18
cdplib/utils/ExceptionsHandler.py

@@ -8,35 +8,45 @@ Created on Fri Sep 27 14:20:58 2019
 
 import os
 import sys
-import logging
 import pandas as pd
+from cdplib.log import Log
+
 sys.path.append(os.getcwd())
 
 
 class ExceptionsHandler:
     '''
     '''
-    def __init__(self):
+    def __init__(self, logger: Log = None):
         '''
         '''
+        self._logger = logger or Log("ExceptionHandler")
 
-    def check_is_file(self, path, logger=None):
+    def check_is_file(self, path: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
-
         if not os.path.isfile(path):
             err = "File {} not found".format(path)
-            logger.error(err)
+            self._logger.error(err)
             raise FileNotFoundError(err)
 
-    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
-                               error_or_warning: str, logger = None):
+    def assert_is_directory(self, path: str):
+        ""
+        ""
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must of str type"
+
+        dirname = os.path.dirname("path")
+
+        if len(dirname) > 0:
+            os.mkdir(dirname, exists_ok=True)
+
+    def _check_column_abscence(self,
+                               columns: (str, list),
+                               data: pd.DataFrame,
+                               error_or_warning: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
         if isinstance(columns, str):
             columns = [columns]
 
@@ -44,23 +54,23 @@ class ExceptionsHandler:
 
             if column not in data.columns:
                 err = ("{} is not an internal column name".format(column))
-                getattr(logger, error_or_warning)(err)
+                getattr(self._logger, error_or_warning)(err)
 
                 if error_or_warning == "error":
                     raise Exception(err)
 
-    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def error_column_abscence(self,
+                              columns: (str, list),
+                              data: pd.DataFrame):
         '''
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="error",
-                                           logger=logger)
+                                           error_or_warning="error")
 
-    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
         '''
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="warning",
-                                           logger=logger)
+                                           error_or_warning="warning")

+ 46 - 0
cdplib/utils/LoadingUtils.py

@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct  1 12:58:58 2020
+
+@author: tanya
+@description: class for methods of loading data from external sources
+"""
+
+import os
+import sys
+from cdplib.log import Log
+
+
class LoadingUtils:
    """
    Collection of methods for loading data/objects from external sources.
    """
    def __init__(self, logger=None):
        """
        :param logger: optional Log instance; a fresh Log named
            "LoadingUtils" is created when none is given
        """
        self._logger = logger or Log("LoadingUtils")

    def load_from_module(self, module_path: str, name: str):
        """
        Load an object (for example a hyperparameter space definition)
        by name from a python module given by its file path.

        :param str module_path: path to an existing .py file
        :param str name: name of the object defined in that module
        :return: the loaded object
        """
        import importlib.util

        # validate the argument values themselves; the original code
        # iterated over the literal parameter names, which made the
        # isinstance assertion always true (and misspelled one name)
        for param_name, param_value in [("module_path", module_path),
                                        ("name", name)]:
            assert(isinstance(param_value, str)),\
                "Parameter '{}' must be of str type".format(param_name)

        assert(os.path.isfile(module_path)),\
            "Parameter 'module_path' must be a valid file"

        module, extension = os.path.splitext(os.path.basename(module_path))

        # the original compared against ",py" which is always false
        assert(extension == ".py"),\
            "Parameter 'space' must be read from a python file"

        try:
            # import the module by its path and fetch the requested
            # attribute; the original "from module import name" would
            # look for a module literally called "module", and
            # sys.path.insert was called without an index
            spec = importlib.util.spec_from_file_location(module,
                                                          module_path)
            loaded_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(loaded_module)

            return getattr(loaded_module, name)

        except (ImportError, AttributeError):
            err = "Invalid space location or name"
            self._logger.log_and_raise_error(err)

+ 36 - 0
cdplib/utils/TypeConverter.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 24 09:06:13 2020
+
+@author: tanya
+"""
+
+import numpy as np
+import pandas as pd
+
class TypeConverter:
    """
    Library for methods to manage python types
    """
    def __init__(self):
        """
        Creates the logger used for error reporting.
        """
        # NOTE(review): local import — presumably to avoid a circular
        # import with cdplib.log; confirm before hoisting to module level
        from cdplib.log import Log

        self._logger = Log("TypeConverter")

    def convert_to_ndarray(self, x: (pd.DataFrame, pd.Series,
                                     np.ndarray)) -> np.ndarray:
        '''
        Converts a pandas DataFrame or Series to a numpy array;
        numpy arrays are passed through unchanged.

        :param x: the object to convert
        :return: the underlying numpy array
        :raises: (via the logger) when x has an unsupported type
        '''
        if isinstance(x, np.ndarray):
            return x

        # a single isinstance call with a tuple covers both DataFrame
        # and Series (the annotation now mentions Series as well)
        elif isinstance(x, (pd.DataFrame, pd.Series)):
            return x.values

        else:
            self._logger.log_and_raise_error_stack_info(
                    'The argument must be a numpy array or a pandas DataFrame')

+ 0 - 0
cdplib/utils/__init__.py



+ 0 - 0
hooks/README.txt


+ 0 - 0
hooks/pre-commit




+ 115 - 0
tests/testSQLOperations.py

@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 19 14:34:22 2018
+
+@author: tanya
+"""
+
+import os
+import unittest
+import pandas as pd
+
+from libraries.database_operations_library import SQLOperations
+
+
class TestSQLOperations(unittest.TestCase):
    '''
    Base fixture for SQLOperations tests: connects to the database
    and prepares a small test DataFrame.

    NOTE(review): overriding TestCase.__init__ without forwarding the
    methodName argument breaks standard unittest discovery — these
    classes are apparently meant to be driven manually (see the
    __main__ block at the bottom of the file); confirm.
    '''
    def __init__(self, test_df = None):
        print('\n', '='*5, 'Testing class : SQLOperations', '='*5)
        # db_url = None: SQLOperations falls back to its default URL
        # (presumably — confirm in database_operations_library)
        self.inst = SQLOperations(db_url = None)
        print('Connected to', self.inst.db_url)
        
        if test_df is None:
            # default fixture: one int, one text, one decimal column
            self.test_df = pd.DataFrame({'a' : [1,2,3,4,5], 'b' : ['A', 'B', 'C', 'A', 'V'], 'c' : [0.1, 0.2, 0.3, 0.4, 0.5]})
        else:
            self.test_df = test_df

        
    def _create_test_table(self, test_tablename, create_table_query = None):
        '''
        Drop the table if present, then (re)create it — either from the
        given DDL or from a dialect-dependent default.
        '''
        self.inst.drop_table_if_exists(test_tablename)        
        
        if create_table_query is None:
            if 'ibm_db' in self.inst.db_url:
                # DB2 dialect
                create_table_query = """CREATE TABLE {} (
                                        a INT,
                                        b CHAR,
                                        c DECIMAL(10 , 2 )
                                        );""".format(test_tablename)
            else:
                # NOTE(review): this branch hard-codes the table name
                # "test" and ignores test_tablename — confirm intent
                create_table_query = """CREATE TABLE test (
                                        a INT,
                                        b TEXT,
                                        c DECIMAL(10 , 2 )
                                        );"""
                
        self.inst.execute(create_table_query)
+
+
+        
class TestExecute(TestSQLOperations):
    '''
    Tests for SQLOperations.execute (table creation round-trip).
    '''
    def __init__(self):
        super(TestExecute, self).__init__()
        print('\n', '-'*2, 'Testing method : execute')
        
    def test_create_table(self, test_tablename, create_table_query = None):
        '''
        Create a table, assert it exists, then drop it again.

        :param test_tablename: name of the throwaway table
        :param create_table_query: optional DDL override
        '''
        print('-'*4, 'Testing create table operation')
        self._create_test_table(test_tablename = test_tablename, create_table_query = create_table_query)
        self.assertTrue(self.inst.check_if_table_exists(test_tablename))
        # clean up the throwaway table
        self.inst.drop_table_if_exists(test_tablename)
        print('-'*4, 'Test ran successfully!')
+        
class TestLoad_csv_to_db(TestSQLOperations):
    '''
    Tests for SQLOperations.load_csv_to_db: writes the fixture
    DataFrame to csv, loads it into the database and reads it back.
    '''
    def __init__(self):
        super(TestLoad_csv_to_db, self).__init__()
        print('\n', '-'*2, 'Testing method : load_csv_to_db')
    
    def test_correct_content(self, test_csv_path, test_tablename, create_table_query = None):
        '''
        Round-trip check: csv -> database -> DataFrame must preserve
        shape, columns and content.

        :param test_csv_path: where the intermediate csv is written
        :param test_tablename: name of the target table
        :param create_table_query: optional DDL override
        '''
        print('-'*4, 'Testig that load operation gives correct result')
        os.makedirs(os.path.dirname(test_csv_path), exist_ok = True)
        if not self.inst.drop_table_if_exists(test_tablename):
            self._create_test_table(test_tablename)
    
        self.test_df.to_csv(test_csv_path, index = False)
        self.inst.load_csv_to_db(csv_path = test_csv_path, tablename = test_tablename)
        try:
            connection = self.inst.engine.connect()
            # NOTE(review): the query hard-codes table "test" and
            # ignores test_tablename — confirm intent
            test_df_from_sql = pd.read_sql(sql = "SELECT * FROM test", con = connection)
            connection.close()
        except Exception as e:
            raise Exception('ERROR: test csv file has not been load to sql at all, \n, exit with {}'.format(e))
        
        print('-'*4, 'Testing data has correct shape')
        self.assertTupleEqual(self.test_df.shape, test_df_from_sql.shape)
        
        print('-'*4,'Testing data has correct columns')
        self.assertSetEqual(set(self.test_df.columns), set(test_df_from_sql.columns))
        
        print('-'*4,'Testing data has correct content')
        # align dtypes before comparing: the database round-trip may
        # change column types
        for col in self.test_df.columns:
            test_df_from_sql[col] = test_df_from_sql[col].astype(self.test_df[col].dtype)
        pd.testing.assert_frame_equal(self.test_df, test_df_from_sql)
        
        print('-'*4, 'Test ran successfully!')
+
+                        
if __name__ == '__main__':
    
    # integration-style smoke test: requires a reachable database and
    # write access to the hard-coded csv path below
    test_tablename = 'test10'
    test_csv_path = '/home/tanya/acdp/data_samples/test.csv'
    
    TestExecute().test_create_table(test_tablename = test_tablename)
    TestLoad_csv_to_db().test_correct_content(test_csv_path = test_csv_path, test_tablename = test_tablename)
    
    print('Done!', '\n')

+ 177 - 0
tests/testStatisticalFeatures.py

@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 18 16:26:47 2018
+
+@author: tanya
+"""
+
+import os
+import unittest
+import logging
+import pandas as pd
+import numpy as np
+
+from pandas.util.testing import assert_frame_equal
+
+from libraries.feature_engineering.in_memory_feature_engineering.StatisticalFeatures import StatisticalFeatures
+from libraries.logging.logging_utils import configure_logging
+
+
class TestStatisticalFeatures(unittest.TestCase):
    '''Base fixture for StatisticalFeatures tests: builds a small mixed-type
    DataFrame (ints, floats, strings, datetimes, an all-NaN column and two
    id columns) and instantiates the object under test on it.
    '''
    def __init__(self, data = None, index_cols = None, path_to_log = None):
        '''
        :param DataFrame data: custom test data; a default frame is built if None
        :param list index_cols: grouping columns; defaults to ['id1', 'id2']
        :param str path_to_log: forwarded to StatisticalFeatures
        '''
        if index_cols is None:
            self.index_cols = ['id1', 'id2']
        else:
            self.index_cols = index_cols

        if data is None:
            # BUG FIX: pd.Timestamp replaces pd.datetime, which was deprecated
            # and removed in pandas >= 2.0 (same values, no new import needed)
            self.data = pd.DataFrame({'int' : [1,2,3,2,55,3,7],
                                      'float' : [0.1, 7, 0.1, 99.9, 99.9, np.nan, 7],
                                      'str' : ['a', np.nan, 'c', 'a', 'a', '', 'c'],
                                      'datetime' : [pd.Timestamp(2017, 1, 2), np.nan, pd.Timestamp(2017, 5, 3), pd.Timestamp(2017, 1, 4),
                                                    '2018-01-19', pd.Timestamp(2018, 1, 4), pd.Timestamp(2019, 3, 23)],
                                      'nan' : [np.nan]*7,
                                      'id1' : [1,1,3,3,3,1,1],
                                      'id2' : ['a', 'a', 'b', 'b', 'a', 'a', np.nan]})\
                                      .sort_values(by = self.index_cols)
        else:
            self.data = data

        self.obj = StatisticalFeatures(data = self.data, index_cols = self.index_cols, path_to_log = path_to_log)
+            
class TestKpisByAggregation(TestStatisticalFeatures):
    '''Tests for StatisticalFeatures.get_kpis_by_aggregation: kpi specs given
    as lists of tuples, as dicts, with custom callables, and with columns
    that do not exist in the data.
    '''
    def __init__(self, data = None, index_cols = None, path_to_log = None):
        '''
        '''
        super(TestKpisByAggregation, self).__init__(data = data, index_cols = index_cols, path_to_log = path_to_log)

    def _assert_result(self, kpis, answer):
        '''Run the aggregation, align on index_cols and compare to answer.'''
        result = self.obj.get_kpis_by_aggregation(kpis = kpis)\
                     .sort_values(self.index_cols).set_index(self.index_cols)
        assert_frame_equal(result, answer[result.columns])

    def test_builtin_aggfuncs_numeric_cols(self, answer = None, kpis = None):
        '''Tests the expected behaviour of pandas builtin aggregation function,
           in particular behaviour with missing values

           :param DataFrame answer: expected result; a default is built if None
           :param list of tuples or dict kpis: aggregation spec
        '''
        if kpis is None:
            kpis = [('int', ['min', 'std']),
                    ('float', ['mean', np.sum]),
                    ('float', 'sum'),
                    ('nan', 'mean')]

        # BUG FIX: `answer or default` raised ValueError whenever a caller
        # passed a DataFrame (DataFrame truth value is ambiguous); an
        # explicit None check is required for DataFrame arguments
        if answer is None:
            answer = pd.DataFrame([
                         {'id1' : 1, 'id2' : 'a', 'int_min' : 1, 'int_std' : pd.Series([1,2,3]).std(), 'float_mean' : np.mean([0.1, 7.0]), 'float_sum' : 7.1, 'nan_mean' : np.nan},
                         {'id1' : 3, 'id2' : 'b', 'int_min' : 2, 'int_std' : pd.Series([2,3]).std(), 'float_mean' : np.mean([0.1, 99.9]), 'float_sum' : 100, 'nan_mean' : np.nan},
                         {'id1' : 3, 'id2' : 'a', 'int_min' : 55, 'int_std' : np.nan, 'float_mean' : 99.9, 'float_sum' : 99.9, 'nan_mean' : np.nan},
                         ]).sort_values(self.index_cols).set_index(self.index_cols)

        self._assert_result(kpis, answer)

    def test_dict_kpi(self, kpis = None, answer = None):
        '''kpis given as a dict column -> aggregation(s).'''
        if kpis is None:
            kpis = {'int' : ['min', 'std'], 'float' : 'mean'}

        if answer is None:
            answer = pd.DataFrame([
                         {'id1' : 1, 'id2' : 'a', 'int_min' : 1, 'int_std' : pd.Series([1,2,3]).std(), 'float_mean' : np.mean([0.1, 7.0])},
                         {'id1' : 3, 'id2' : 'b', 'int_min' : 2, 'int_std' : pd.Series([2,3]).std(), 'float_mean' : np.mean([0.1, 99.9])},
                         {'id1' : 3, 'id2' : 'a', 'int_min' : 55, 'int_std' : np.nan, 'float_mean' : 99.9},
                         ]).sort_values(self.index_cols).set_index(self.index_cols)

        self._assert_result(kpis, answer)

    def test_string_cols(self, kpis = None, answer = None):
        '''Aggregation over a string column (sum = concatenation).'''
        if kpis is None:
            kpis = {'str' : ['sum']}

        if answer is None:
            answer = pd.DataFrame([
                         {'id1' : 1, 'id2' : 'a', 'str_sum' : 'anan'},
                         {'id1' : 3, 'id2' : 'b', 'str_sum' : 'ca'},
                         {'id1' : 3, 'id2' : 'a', 'str_sum' : 'a'},
                         ]).sort_values(self.index_cols).set_index(self.index_cols)

        self._assert_result(kpis, answer)

    def test_custom_aggfunc(self, kpis = None, answer = None):
        '''User-defined callable as aggregation function.

        BUG FIX: kpis previously had no default value although the body
        handled the None case; it is now an optional keyword like in the
        sibling test methods.
        '''
        if kpis is None:
            def custom_sum(x):
                return np.sum(x)

            kpis = {'int' : custom_sum}

        if answer is None:
            answer = pd.DataFrame([
                         {'id1' : 1, 'id2' : 'a', 'int_custom_sum' : 6},
                         {'id1' : 3, 'id2' : 'b', 'int_custom_sum' : 55},
                         {'id1' : 3, 'id2' : 'a', 'int_custom_sum' : 5},
                         ]).sort_values(self.index_cols).set_index(self.index_cols)

        self._assert_result(kpis, answer)

    def test_some_wrong_col(self, kpis = None, answer = None):
        '''Non-existing columns in the spec are skipped; valid ones aggregated.'''
        if kpis is None:
            kpis = {'bla' : 'sum', 'int' : 'sum'}

        if answer is None:
            answer = pd.DataFrame([
                         {'id1' : 1, 'id2' : 'a', 'int_sum' : 6},
                         {'id1' : 3, 'id2' : 'a', 'int_sum' : 55},
                         {'id1' : 3, 'id2' : 'b', 'int_sum' : 5},
                         ]).sort_values(self.index_cols).set_index(self.index_cols)

        self._assert_result(kpis, answer)

    def test_all_wrong_cols(self, kpis = None, answer = None):
        '''When every requested column is absent the result degrades to the
        deduplicated index columns.'''
        if kpis is None:
            kpis = {'bla' : 'sum', 'blub' : 'sum'}

        result = self.obj.get_kpis_by_aggregation(kpis = kpis)

        # BUG FIX: a caller-supplied answer was silently ignored before
        if answer is None:
            answer = self.data[self.index_cols].drop_duplicates().reset_index(drop = True)

        assert_frame_equal(result, answer[result.columns])
+        
if __name__ == '__main__':

    # BUG FIX: os.environ.get('PROJECT_DIR') returns None when the variable
    # is unset and os.path.join(None, ...) raises TypeError; fall back to
    # the current directory instead
    project_dir = os.environ.get('PROJECT_DIR', '.')
    path_to_log = os.path.join(project_dir,
                               'tests', 'test_feature_engineering','test_in_memory_feature_engineering',
                               'test_kpis_by_aggregation.log')

    configure_logging(path_to_log)
    logger = logging.getLogger(__name__)

    # run every get_kpis_by_aggregation test case in sequence
    inst = TestKpisByAggregation(path_to_log = path_to_log)
    inst.test_builtin_aggfuncs_numeric_cols()
    inst.test_dict_kpi()
    inst.test_string_cols()
    inst.test_some_wrong_col()
    inst.test_all_wrong_cols()

    logger.info('Done testing method get_kpis_by_aggregation!')