Browse Source

Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib

ogert 3 years ago
parent
commit
60abdd6dc6

+ 3 - 3
Pipfile

@@ -9,12 +9,12 @@ verify_ssl = true
 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
 pandas = "!=0.24.0"
 sqlalchemy = "*"
-sqlparse = "*"
-pymysql = "*"
+# sqlparse = "*"
+# pymysql = "*"
 pymongo = "*"
 jsonref = "*"
 simplejson = "*"
-mysql = "*"
+# mysql = "*"
 hyperopt = "*"
 influxdb = "*"
 

+ 112 - 146
Pipfile.lock

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "1879ebbd4ee3fe44d9e59091889a69ead4c7b76e81b70de0dd74d12b5266cf42"
+            "sha256": "194ac762a9255b24372b56ca5cd3def753b14c78784407b69cba906417bc53b2"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -26,7 +26,7 @@
         "cdplib": {
             "editable": true,
             "git": "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git",
-            "ref": "623f7488557e373eb3181bb4099295ed17a53b5c"
+            "ref": "362220280f6768abda363240b5fce51eb4d9016e"
         },
         "certifi": {
             "hashes": [
@@ -130,14 +130,6 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.10"
         },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:c9db46394197244adf2f0b08ec5bc3cf16757e9590b02af1fca085c16c0d600a",
-                "sha256:d2d46ef77ffc85cbf7dac7e81dd663fde71c45326131bea8033b9bad42268ebe"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.10.0"
-        },
         "influxdb": {
             "hashes": [
                 "sha256:46f85e7b04ee4b3dee894672be6a295c94709003a7ddea8820deec2ac4d8b27a",
@@ -199,7 +191,6 @@
             "hashes": [
                 "sha256:55e66b5e7b3823b1da5fb2a063e95a628fb850b2a0b76bdcd884faac5d2daa7d"
             ],
-            "index": "pypi",
             "version": "==0.0.2"
         },
         "mysqlclient": {
@@ -253,33 +244,25 @@
         },
         "pandas": {
             "hashes": [
-                "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b",
-                "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f",
-                "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648",
-                "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb",
-                "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98",
-                "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a",
-                "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11",
-                "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a",
-                "sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e",
-                "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae",
-                "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d",
-                "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9",
-                "sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b",
-                "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788",
-                "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca",
-                "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5",
-                "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a",
-                "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814",
-                "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d",
-                "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086",
-                "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a",
-                "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb",
-                "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782",
-                "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"
+                "sha256:167693a80abc8eb28051fbd184c1b7afd13ce2c727a5af47b048f1ea3afefff4",
+                "sha256:2111c25e69fa9365ba80bbf4f959400054b2771ac5d041ed19415a8b488dc70a",
+                "sha256:298f0553fd3ba8e002c4070a723a59cdb28eda579f3e243bc2ee397773f5398b",
+                "sha256:2b063d41803b6a19703b845609c0b700913593de067b552a8b24dd8eeb8c9895",
+                "sha256:2cb7e8f4f152f27dc93f30b5c7a98f6c748601ea65da359af734dd0cf3fa733f",
+                "sha256:52d2472acbb8a56819a87aafdb8b5b6d2b3386e15c95bde56b281882529a7ded",
+                "sha256:612add929bf3ba9d27b436cc8853f5acc337242d6b584203f207e364bb46cb12",
+                "sha256:649ecab692fade3cbfcf967ff936496b0cfba0af00a55dfaacd82bdda5cb2279",
+                "sha256:68d7baa80c74aaacbed597265ca2308f017859123231542ff8a5266d489e1858",
+                "sha256:8d4c74177c26aadcfb4fd1de6c1c43c2bf822b3e0fc7a9b409eeaf84b3e92aaa",
+                "sha256:971e2a414fce20cc5331fe791153513d076814d30a60cd7348466943e6e909e4",
+                "sha256:9db70ffa8b280bb4de83f9739d514cd0735825e79eef3a61d312420b9f16b758",
+                "sha256:b730add5267f873b3383c18cac4df2527ac4f0f0eed1c6cf37fcb437e25cf558",
+                "sha256:bd659c11a4578af740782288cac141a322057a2e36920016e0fc7b25c5a4b686",
+                "sha256:c601c6fdebc729df4438ec1f62275d6136a0dd14d332fc0e8ce3f7d2aadb4dd6",
+                "sha256:d0877407359811f7b853b548a614aacd7dea83b0c0c84620a9a643f180060950"
             ],
             "index": "pypi",
-            "version": "==1.1.5"
+            "version": "==1.2.4"
         },
         "pymongo": {
             "hashes": [
@@ -356,7 +339,7 @@
                 "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641",
                 "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"
             ],
-            "index": "pypi",
+            "markers": "python_version >= '3.6'",
             "version": "==1.0.2"
         },
         "python-dateutil": {
@@ -384,67 +367,67 @@
         },
         "scikit-learn": {
             "hashes": [
-                "sha256:0567a2d29ad08af98653300c623bd8477b448fe66ced7198bef4ed195925f082",
-                "sha256:087dfede39efb06ab30618f9ab55a0397f29c38d63cd0ab88d12b500b7d65fd7",
-                "sha256:1adf483e91007a87171d7ce58c34b058eb5dab01b5fee6052f15841778a8ecd8",
-                "sha256:259ec35201e82e2db1ae2496f229e63f46d7f1695ae68eef9350b00dc74ba52f",
-                "sha256:3c4f07f47c04e81b134424d53c3f5e16dfd7f494e44fd7584ba9ce9de2c5e6c1",
-                "sha256:4562dcf4793e61c5d0f89836d07bc37521c3a1889da8f651e2c326463c4bd697",
-                "sha256:4ddd2b6f7449a5d539ff754fa92d75da22de261fd8fdcfb3596799fadf255101",
-                "sha256:54be0a60a5a35005ad69c75902e0f5c9f699db4547ead427e97ef881c3242e6f",
-                "sha256:5580eba7345a4d3b097be2f067cc71a306c44bab19e8717a30361f279c929bea",
-                "sha256:7b04691eb2f41d2c68dbda8d1bd3cb4ef421bdc43aaa56aeb6c762224552dfb6",
-                "sha256:826b92bf45b8ad80444814e5f4ac032156dd481e48d7da33d611f8fe96d5f08b",
-                "sha256:83b21ff053b1ff1c018a2d24db6dd3ea339b1acfbaa4d9c881731f43748d8b3b",
-                "sha256:8772b99d683be8f67fcc04789032f1b949022a0e6880ee7b75a7ec97dbbb5d0b",
-                "sha256:895dbf2030aa7337649e36a83a007df3c9811396b4e2fa672a851160f36ce90c",
-                "sha256:8aa1b3ac46b80eaa552b637eeadbbce3be5931e4b5002b964698e33a1b589e1e",
-                "sha256:9599a3f3bf33f73fed0fe06d1dfa4e6081365a58c1c807acb07271be0dce9733",
-                "sha256:99349d77f54e11f962d608d94dfda08f0c9e5720d97132233ebdf35be2858b2d",
-                "sha256:9a24d1ccec2a34d4cd3f2a1f86409f3f5954cc23d4d2270ba0d03cf018aa4780",
-                "sha256:9bed8a1ef133c8e2f13966a542cb8125eac7f4b67dcd234197c827ba9c7dd3e0",
-                "sha256:9c6097b6a9b2bafc5e0f31f659e6ab5e131383209c30c9e978c5b8abdac5ed2a",
-                "sha256:9dfa564ef27e8e674aa1cc74378416d580ac4ede1136c13dd555a87996e13422",
-                "sha256:a0334a1802e64d656022c3bfab56a73fbd6bf4b1298343f3688af2151810bbdf",
-                "sha256:a29460499c1e62b7a830bb57ca42e615375a6ab1bcad053cd25b493588348ea8",
-                "sha256:a36e159a0521e13bbe15ca8c8d038b3a1dd4c7dad18d276d76992e03b92cf643",
-                "sha256:abe835a851610f87201819cb315f8d554e1a3e8128912783a31e87264ba5ffb7",
-                "sha256:c13ebac42236b1c46397162471ea1c46af68413000e28b9309f8c05722c65a09",
-                "sha256:c3deb3b19dd9806acf00cf0d400e84562c227723013c33abefbbc3cf906596e9",
-                "sha256:c658432d8a20e95398f6bb95ff9731ce9dfa343fdf21eea7ec6a7edfacd4b4d9",
-                "sha256:c7f4eb77504ac586d8ac1bde1b0c04b504487210f95297235311a0ab7edd7e38",
-                "sha256:d54dbaadeb1425b7d6a66bf44bee2bb2b899fe3e8850b8e94cfb9c904dcb46d0",
-                "sha256:ddb52d088889f5596bc4d1de981f2eca106b58243b6679e4782f3ba5096fd645",
-                "sha256:ed9d65594948678827f4ff0e7ae23344e2f2b4cabbca057ccaed3118fdc392ca",
-                "sha256:fab31f48282ebf54dd69f6663cd2d9800096bad1bb67bbc9c9ac84eb77b41972"
+                "sha256:038f4e9d6ef10e1f3fe82addc3a14735c299866eb10f2c77c090410904828312",
+                "sha256:06ffdcaaf81e2a3b1b50c3ac6842cfb13df2d8b737d61f64643ed61da7389cde",
+                "sha256:0e71ce9c7cbc20f6f8b860107ce15114da26e8675238b4b82b7e7cd37ca0c087",
+                "sha256:1eec963fe9ffc827442c2e9333227c4d49749a44e592f305398c1db5c1563393",
+                "sha256:2754c85b2287333f9719db7f23fb7e357f436deed512db3417a02bf6f2830aa5",
+                "sha256:2db429090b98045d71218a9ba913cc9b3fe78e0ba0b6b647d8748bc6d5a44080",
+                "sha256:39b7e3b71bcb1fe46397185d6c1a5db1c441e71c23c91a31e7ad8cc3f7305f9a",
+                "sha256:3cbd734e1aefc7c5080e6b6973fe062f97c26a1cdf1a991037ca196ce1c8f427",
+                "sha256:40556bea1ef26ef54bc678d00cf138a63069144a0b5f3a436eecd8f3468b903e",
+                "sha256:48f273836e19901ba2beecd919f7b352f09310ce67c762f6e53bc6b81cacf1f0",
+                "sha256:49ec0b1361da328da9bb7f1a162836028e72556356adeb53342f8fae6b450d47",
+                "sha256:4e6198675a6f9d333774671bd536668680eea78e2e81c0b19e57224f58d17f37",
+                "sha256:5beaeb091071625e83f5905192d8aecde65ba2f26f8b6719845bbf586f7a04a1",
+                "sha256:5ff3e4e4cf7592d36541edec434e09fb8ab9ba6b47608c4ffe30c9038d301897",
+                "sha256:62214d2954377fcf3f31ec867dd4e436df80121e7a32947a0b3244f58f45e455",
+                "sha256:7be1b88c23cfac46e06404582215a917017cd2edaa2e4d40abe6aaff5458f24b",
+                "sha256:8fac72b9688176922f9f54fda1ba5f7ffd28cbeb9aad282760186e8ceba9139a",
+                "sha256:90a297330f608adeb4d2e9786c6fda395d3150739deb3d42a86d9a4c2d15bc1d",
+                "sha256:a2a47449093dcf70babc930beba2ca0423cb7df2fa5fd76be5260703d67fa574",
+                "sha256:ae19ac105cf7ce8c205a46166992fdec88081d6e783ab6e38ecfbe45729f3c39",
+                "sha256:ae426e3a52842c6b6d77d00f906b6031c8c2cfdfabd6af7511bb4bc9a68d720e",
+                "sha256:cbdb0b3db99dd1d5f69d31b4234367d55475add31df4d84a3bd690ef017b55e2",
+                "sha256:cdf24c1b9bbeb4936456b42ac5bd32c60bb194a344951acb6bfb0cddee5439a4",
+                "sha256:d14701a12417930392cd3898e9646cf5670c190b933625ebe7511b1f7d7b8736",
+                "sha256:d177fe1ff47cc235942d628d41ee5b1c6930d8f009f1a451c39b5411e8d0d4cf",
+                "sha256:d5bf9c863ba4717b3917b5227463ee06860fc43931dc9026747de416c0a10fee",
+                "sha256:dd968a174aa82f3341a615a033fa6a8169e9320cbb46130686562db132d7f1f0",
+                "sha256:f0ed4483c258fb23150e31b91ea7d25ff8495dba108aea0b0d4206a777705350",
+                "sha256:f18c3ed484eeeaa43a0d45dc2efb4d00fc6542ccdcfa2c45d7b635096a2ae534",
+                "sha256:f1d2108e770907540b5248977e4cff9ffaf0f73d0d13445ee938df06ca7579c6",
+                "sha256:f3ec00f023d84526381ad0c0f2cff982852d035c921bbf8ceb994f4886c00c64",
+                "sha256:f74429a07fedb36a03c159332b914e6de757176064f9fed94b5f79ebac07d913",
+                "sha256:fec42690a2eb646b384eafb021c425fab48991587edb412d4db77acc358b27ce"
             ],
             "markers": "python_version >= '3.6'",
-            "version": "==0.24.1"
+            "version": "==0.24.2"
         },
         "scipy": {
             "hashes": [
-                "sha256:03f1fd3574d544456325dae502facdf5c9f81cbfe12808a5e67a737613b7ba8c",
-                "sha256:0c81ea1a95b4c9e0a8424cf9484b7b8fa7ef57169d7bcc0dfcfc23e3d7c81a12",
-                "sha256:1fba8a214c89b995e3721670e66f7053da82e7e5d0fe6b31d8e4b19922a9315e",
-                "sha256:37f4c2fb904c0ba54163e03993ce3544c9c5cde104bcf90614f17d85bdfbb431",
-                "sha256:50e5bcd9d45262725e652611bb104ac0919fd25ecb78c22f5282afabd0b2e189",
-                "sha256:6ca1058cb5bd45388041a7c3c11c4b2bd58867ac9db71db912501df77be2c4a4",
-                "sha256:77f7a057724545b7e097bfdca5c6006bed8580768cd6621bb1330aedf49afba5",
-                "sha256:816951e73d253a41fa2fd5f956f8e8d9ac94148a9a2039e7db56994520582bf2",
-                "sha256:96620240b393d155097618bcd6935d7578e85959e55e3105490bbbf2f594c7ad",
-                "sha256:993c86513272bc84c451349b10ee4376652ab21f312b0554fdee831d593b6c02",
-                "sha256:adf7cee8e5c92b05f2252af498f77c7214a2296d009fc5478fc432c2f8fb953b",
-                "sha256:bc52d4d70863141bb7e2f8fd4d98e41d77375606cde50af65f1243ce2d7853e8",
-                "sha256:c1d3f771c19af00e1a36f749bd0a0690cc64632783383bc68f77587358feb5a4",
-                "sha256:d744657c27c128e357de2f0fd532c09c84cd6e4933e8232895a872e67059ac37",
-                "sha256:e3e9742bad925c421d39e699daa8d396c57535582cba90017d17f926b61c1552",
-                "sha256:e547f84cd52343ac2d56df0ab08d3e9cc202338e7d09fafe286d6c069ddacb31",
-                "sha256:e89091e6a8e211269e23f049473b2fde0c0e5ae0dd5bd276c3fc91b97da83480",
-                "sha256:e9da33e21c9bc1b92c20b5328adb13e5f193b924c9b969cd700c8908f315aa59",
-                "sha256:ffdfb09315896c6e9ac739bb6e13a19255b698c24e6b28314426fd40a1180822"
+                "sha256:01b38dec7e9f897d4db04f8de4e20f0f5be3feac98468188a0f47a991b796055",
+                "sha256:10dbcc7de03b8d635a1031cb18fd3eaa997969b64fdf78f99f19ac163a825445",
+                "sha256:19aeac1ad3e57338723f4657ac8520f41714804568f2e30bd547d684d72c392e",
+                "sha256:1b21c6e0dc97b1762590b70dee0daddb291271be0580384d39f02c480b78290a",
+                "sha256:1caade0ede6967cc675e235c41451f9fb89ae34319ddf4740194094ab736b88d",
+                "sha256:23995dfcf269ec3735e5a8c80cfceaf384369a47699df111a6246b83a55da582",
+                "sha256:2a799714bf1f791fb2650d73222b248d18d53fd40d6af2df2c898db048189606",
+                "sha256:3274ce145b5dc416c49c0cf8b6119f787f0965cd35e22058fe1932c09fe15d77",
+                "sha256:33d1677d46111cfa1c84b87472a0274dde9ef4a7ef2e1f155f012f5f1e995d8f",
+                "sha256:44d452850f77e65e25b1eb1ac01e25770323a782bfe3a1a3e43847ad4266d93d",
+                "sha256:9e3302149a369697c6aaea18b430b216e3c88f9a61b62869f6104881e5f9ef85",
+                "sha256:a75b014d3294fce26852a9d04ea27b5671d86736beb34acdfc05859246260707",
+                "sha256:ad7269254de06743fb4768f658753de47d8b54e4672c5ebe8612a007a088bd48",
+                "sha256:b30280fbc1fd8082ac822994a98632111810311a9ece71a0e48f739df3c555a2",
+                "sha256:b79104878003487e2b4639a20b9092b02e1bad07fc4cf924b495cf413748a777",
+                "sha256:d449d40e830366b4c612692ad19fbebb722b6b847f78a7b701b1e0d6cda3cc13",
+                "sha256:d647757373985207af3343301d89fe738d5a294435a4f2aafb04c13b4388c896",
+                "sha256:f68eb46b86b2c246af99fcaa6f6e37c7a7a413e1084a794990b877f2ff71f7b6",
+                "sha256:fdf606341cd798530b05705c87779606fcdfaf768a8129c348ea94441da15b04"
             ],
             "markers": "python_version < '3.10' and python_version >= '3.7'",
-            "version": "==1.6.2"
+            "version": "==1.6.3"
         },
         "simplejson": {
             "hashes": [
@@ -513,56 +496,56 @@
         },
         "sqlalchemy": {
             "hashes": [
-                "sha256:013b659efe02f0f58e7f759602584899c921c178c6a972978f16460dcdd782d5",
-                "sha256:193c3ca465fbc68de071995a461ab535466f041089d372ee6a6f0aae7b9307e6",
-                "sha256:2071ee6cd9390a9527a80ef03458fb58e0166bb299db2c62f9d688b6772d76a1",
-                "sha256:21becd8b45ec70b703239cf915104e47889c2aad96d0f68f597b9b547cbfd787",
-                "sha256:2713b338d9c54d2c3c7ff4f7786a40a5ca85013c8ccea00327b034d42598e22e",
-                "sha256:2a042c27b1a32a87f4cead53bcdd28999324992650896094368a595165b31d97",
-                "sha256:2e65c1146f5b4151cc6e553d9847299c97f53640d94ba88b1c534e15cdc6ac38",
-                "sha256:345c201324066b789804411f07eea750e9f29872be052eba221ce76add647d50",
-                "sha256:360a771b538463053383fb6ff7aceffb595248d7059bb9e003bf70562a66510d",
-                "sha256:432e98e6fe0d24e8181eb4177e59cba9f8831dcaf272a0d2de75bc8b933952a0",
-                "sha256:4387ebd5ae8bc2c716dbfc1ece769c867307eeecc192e72a4d2e7fa0fc092646",
-                "sha256:43fef20dd1024409375cc646a4b5afaffb62f6488e41588cde2a1ed2e9432b5b",
-                "sha256:4d71ee83441826fb48771e58cef51191500a87734b4acb6b698ca018479395bd",
-                "sha256:4eeff8b12c7d22be4de98721bba5a042875f4365e9fd20dc3916eec474ccb81e",
-                "sha256:534c71caa87c7fdb136ce5073fb42b732a4eb390946f503d8e1d7ce6a4a79100",
-                "sha256:66467123c220689d55c6d51fdf88f7b0b62b8078823c5f6c0297ab47c22003d7",
-                "sha256:6c4af3aceeff6a0e2bd3657d8b25714a9f7c7c606e7ec52029284973094f84c1",
-                "sha256:7d252dea33c1ee07b3d702fb4962963996ea40e5a2615dbe7646ccabd851ac76",
-                "sha256:86a7321636f851c6e8009901c5d67e97d82b86ee8c6f28a476691c41c3d71a95",
-                "sha256:88d75ea6b4330a6f5596a49904f21762ff89ca763db065d63b815ad8c3d68952",
-                "sha256:8a296bbf367867aee2ea8d5b391cb04fbdb3ca7277cd1649d9e8114620f3b090",
-                "sha256:933427a5474e014d01bac93224cd4e2bc7bbc7ce531d0bd7e55e4f940cc8ce0d",
-                "sha256:93f6fe67a76d7fa1cca3b9febb36e9f2dd76055230e2bfa317969532f34c03ab",
-                "sha256:a687e552ab4ffedcf3ec3bd5256ab3e753b4f605b467e9fa39690b2dadb5f607",
-                "sha256:a69787f7fc87b84df7e2f27158476cdf39a79ebb95af1d6f696e474724af9ebe",
-                "sha256:a76c10b467f7d385e4cffe2185d975336acf0dbf24ed702c46207df0fb64055e",
-                "sha256:b093bd6efb49332021714bed5752e784a34ae6d6896ec56ffdc32cc83275a215",
-                "sha256:bdeb300bb9adc02f98957cd0cf0c38d641bdd435b0927e39870a772e0a750bc0",
-                "sha256:c719f0058951457a7761bb69c2e47781a9989ab4819b7a30b6b39141ad013a5f",
-                "sha256:cadb58aeadd9916e79e8f99a49d0c0a9e61ae2b24469c2b304a0699e41a25e59",
-                "sha256:cc3c0d87b11ae1dd1ccbd6fc7875a290b3f73b771254180c2e7b19c2aec7379b",
-                "sha256:d42b8e2bffdf9e01d66cf46472b938493b854ea790a0fbe2e2e42624fc253b33",
-                "sha256:d7684e0598acfbfb5110bea482d8c5e94f52001d6d66b5558177f41f49fb5930",
-                "sha256:e5267cd2e51ddefbe10bb182c36ba41cdaa51c83a0fdfa63ed8cbe89cbcf0f33"
+                "sha256:0140f6dac2659fa6783e7029085ab0447d8eb23cf4d831fb907588d27ba158f7",
+                "sha256:034b42a6a59bf4ddc57e5a38a9dbac83ccd94c0b565ba91dba4ff58149706028",
+                "sha256:03a503ecff0cc2be3ad4dafd220eaff13721edb11c191670b7662932fb0a5c3a",
+                "sha256:069de3a701d33709236efe0d06f38846b738b19c63d45cc47f54590982ba7802",
+                "sha256:1735e06a3d5b0793d5ee2d952df8a5c63edaff6383c2210c9b5c93dc2ea4c315",
+                "sha256:19633df6be629200ff3c026f2837e1dd17908fb1bcea860290a5a45e6fa5148e",
+                "sha256:1e14fa32969badef9c309f55352e5c46f321bd29f7c600556caacdaa3eddfcf6",
+                "sha256:31e941d6db8b026bc63e46ef71e877913f128bd44260b90c645432626b7f9a47",
+                "sha256:452c4e002be727cb6f929dbd32bbc666a0921b86555b8af09709060ed3954bd3",
+                "sha256:45a720029756800628359192630fffdc9660ab6f27f0409bd24d9e09d75d6c18",
+                "sha256:4a2e7f037d3ca818d6d0490e3323fd451545f580df30d62b698da2f247015a34",
+                "sha256:4a7d4da2acf6d5d068fb41c48950827c49c3c68bfb46a1da45ea8fbf7ed4b471",
+                "sha256:4ad4044eb86fbcbdff2106e44f479fbdac703d77860b3e19988c8a8786e73061",
+                "sha256:4f631edf45a943738fa77612e85fc5c5d3fb637c4f5a530f7eedd1a7cd7a70a7",
+                "sha256:6389b10e23329dc8b5600c1a84e3da2628d0f437d8a5cd05aefd1470ec571dd1",
+                "sha256:6ebd58e73b7bd902688c0bb8dbabb0c36b756f02cc7b27ad5efa2f380c611f95",
+                "sha256:7180830ea1082b96b94884bc352b274e29b45151b6ee911bf1fd79cba2de659b",
+                "sha256:789be639501445d85fd4ca41d04f0f5c6cbb6deb0c6826aaa6f22774fe84ef94",
+                "sha256:7d89add44938ea4f52c7641d5805c9e154fed4381e874ef3221483eeb191a96d",
+                "sha256:842b0d4698381aac047f8ae57409c90b7e63ebabf5bc02814ddc8eaefd13499e",
+                "sha256:8f96d4b6a49d3f0f109365bb6303ae5d266d3f90280ca68cf8b2c46032491038",
+                "sha256:961b089e64c2ad29ad367487dd3ba1aa3eeba56bc82037ce91732baaa0f6ca90",
+                "sha256:96de1d4a2e05d4a017087cb29cd6a8ebfeecfd0e9f872880b1a589f011c1c02e",
+                "sha256:98214f04802a3fc740038744d8981a8f2fdca710f791ca125fc4792737d9f3a7",
+                "sha256:9cf94161cb55507cee147bf8abcfd3c076b353ad18743296764dd81108ea74f8",
+                "sha256:9fdf0713166f33e5e6ea98cf59deb305cb323131277f6880de6c509f468076f8",
+                "sha256:a41ab83ecfadf38a47bdfaf4e488f71579df47a711e1ab1dce30d34c7c25bd00",
+                "sha256:ac14fee167653ec6dee32d6aa4d501d90ae1bfbbc3eb5816940bccf227f0d617",
+                "sha256:b8b7d66ee8b8ac272adce0af1342a60854f0d89686e6d3318127a6a82a2f765c",
+                "sha256:bb1072fdf48ba870c0fe81bee8babe4ba2f096fb56bb4f3e0c2386a7626e405c",
+                "sha256:cd823071b97c1a6ac3af9e43b5d861126a1304033dcd18dfe354a02ec45642fe",
+                "sha256:d08173144aebdf30c21a331b532db16535cfa83deed12e8703fa6c67c0894ffc",
+                "sha256:e7d76312e904aa4ea221a92c0bc2e299ad46e4580e2d72ca1f7e6d31dce5bfab",
+                "sha256:f772e4428d413c0affe2a34836278fbe9df9a9c0940705860c2d3a4b50af1a66"
             ],
             "index": "pypi",
-            "version": "==1.4.6"
+            "version": "==1.4.11"
         },
         "sqlalchemy-utils": {
             "hashes": [
-                "sha256:fb66e9956e41340011b70b80f898fde6064ec1817af77199ee21ace71d7d6ab0"
+                "sha256:c7bec2c982b31ec6133ba519f73f07653bbb7e7b3c23836bb8d9133045386b68"
             ],
-            "version": "==0.36.8"
+            "version": "==0.37.0"
         },
         "sqlparse": {
             "hashes": [
                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
                 "sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
             ],
-            "index": "pypi",
+            "markers": "python_version >= '3.5'",
             "version": "==0.4.1"
         },
         "threadpoolctl": {
@@ -581,15 +564,6 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==4.60.0"
         },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
-                "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
-                "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.7.4.3"
-        },
         "urllib3": {
             "hashes": [
                 "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df",
@@ -597,14 +571,6 @@
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
             "version": "==1.26.4"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
-                "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.4.1"
         }
     },
     "develop": {}

+ 35 - 14
cdplib/db_handlers/InfluxdbHandler.py

@@ -82,8 +82,15 @@ class InfluxdbHandler:
         try:
             # result of the query is a defaultdict
             result = self.client.query(query)
-
-            return list(result.values())[0]
+            
+            if len(list(result.values())) > 0:
+
+                return list(result.values())[0]
+            
+            else:
+                
+                return pd.DataFrame()
+            
         except Exception as e:
             self._logger.log_and_raise_error(
                 ("Could not query to dataframe. "
@@ -91,8 +98,8 @@ class InfluxdbHandler:
 
     def query_between_dates(self, columns: str,
                             tables: str,
-                            start: str,
-                            stop: str) -> pd.DataFrame:
+                            start: str = None,
+                            stop: str = None) -> pd.DataFrame:
         """
         :param columns: DESCRIPTION
         :type columns: str
@@ -106,21 +113,35 @@ class InfluxdbHandler:
         :rtype: TYPE
 
         """
-        if not isinstance(start, str):
+        if (start is not None) and (not isinstance(start, str)):
             start = datetime.strftime(start, format="%Y-%m-%dT%H:%M:%SZ")
 
-        if not isinstance(stop, str):
+        if (stop is not None) and (not isinstance(stop, str)):
             stop = datetime.strftime(stop, format="%Y-%m-%dT%H:%M:%SZ")
-
-        query = 'SELECT ' +\
-                columns +\
-                ' FROM \"' +\
-                tables +\
-                '\" WHERE time > \'' +\
+            
+        query = 'SELECT ' + columns + ' FROM \"' + tables
+            
+        if (start is not None) and (stop is not None):
+            
+             query += '\" WHERE time > \'' +\
                 str(start) +\
                 '\' AND time  < \'' +\
                 str(stop) +\
                 '\' tz(\'Europe/Berlin\');'
+                
+        elif start is not None:
+            
+            query += '\" WHERE time >= \'' + str(start) +\
+                '\' tz(\'Europe/Berlin\');'
+            
+        elif stop is not None:
+            
+            query += '\" WHERE time <= \'' + str(stop) +\
+                '\' tz(\'Europe/Berlin\');'
+                
+        else:
+            query += ';'
+            
 
         return self.query_to_dataframe(query)
 
@@ -147,12 +168,12 @@ class InfluxdbHandler:
         """
         
         measurement_columns = [c for c in dataframe.columns
-                               if c not in tag_columns]
+                               if c not in (tag_columns or [])]
         
         for column in measurement_columns:
             try:
                 self.client.write_points(
-                    dataframe=dataframe[[column] + tag_columns],
+                    dataframe=dataframe[[column] + (tag_columns or [])],
                     measurement=column,
                     tag_columns=tag_columns,
                     protocol='line',

+ 173 - 0
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 23 08:51:53 2020
+
+@author: tanya
+
+@description: class for fine-tuning a sklearn classifier
+(optimizing the probability threshold)
+"""
+
+import pandas as pd
+import numpy as np
+
+from typing import Callable
+
+from sklearn.base import (BaseEstimator, ClassifierMixin,
+                          clone, MetaEstimatorMixin)
+
+from cdplib.log import Log
+
+from cdplib.utils.TyperConverter import TypeConverter
+
+
class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
                            MetaEstimatorMixin):
    """
    Probability-threshold tuning for a given estimator.

    Overrides the predict method of the given sklearn classifier
    and returns predictions with the optimal value of
    the probability threshold, found by cross-validation.

    An object of this class can be passed to an sklearn Pipeline.
    """
    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
                 cv=None, threshold_step: float = 0.1):
        """
        :param estimator: sklearn classifier implementing predict_proba
        :param cost_func: metric f(y_true, y_pred) used to score thresholds
        :param greater_is_better: True if larger cost_func values are better
        :param cv: iterable of (train_indices, val_indices) pairs; if None,
            a cv object must be supplied via set_params before calling fit
        :param threshold_step: granularity of the threshold grid in (0, 1)
        """
        self.estimator = estimator

        self.is_fitted = False

        self.greater_is_better = greater_is_better

        # BUGFIX: the original assigned the Ellipsis literal (`self.cv = ...`)
        # when cv was None, which made fit crash when iterating over it.
        self.cv = cv

        self.cost_func = cost_func

        self.threshold_step = threshold_step

        # default threshold until fit computes the tuned one
        self.optimal_threshold = 0.5

        self._logger = Log("FineTunedClassifyCV")

    def _get_best_threshold(self, y_val: (pd.DataFrame, np.array),
                            proba_pred: (pd.DataFrame, np.array)):
        '''
        Return the threshold from the grid (step, 2*step, ..., < 1)
        that optimizes cost_func on the given validation fold.
        '''
        costs = {}

        for t in np.arange(self.threshold_step, 1, self.threshold_step):
            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))

        if self.greater_is_better:
            return max(costs, key=costs.get)
        else:
            return min(costs, key=costs.get)

    def fit(self, X: (pd.DataFrame, np.array),
            y: (pd.DataFrame, np.array) = None,
            **fit_args):
        """
        Find the optimal probability threshold by cross-validation,
        then refit the wrapped estimator on the full data.

        :raises: an error (via the logger) if no cv object is available
        """
        if self.cv is None:
            self._logger.log_and_raise_error("No cv object was provided")

        X = TypeConverter().convert_to_ndarray(X)
        if y is not None:
            # BUGFIX: the original converted X a second time and stored the
            # result in y, silently discarding the labels.
            y = TypeConverter().convert_to_ndarray(y)

        optimal_thrs_per_fold = []

        for train_inds, val_inds in self.cv:
            X_train, X_val = X[train_inds], X[val_inds]

            if y is not None:
                y_train, y_val = y[train_inds], y[val_inds]
            else:
                y_train, y_val = None, None

            # BUGFIX: the original cloned `fine_tuned_clf.estimator`, a
            # module-level variable that only exists in the __main__ test
            # code; any library use raised a NameError here.
            estimator = clone(self.estimator)

            estimator.fit(X_train, y_train, **fit_args)

            # NOTE(review): predict_proba returns one column per class;
            # thresholding the full array works element-wise — confirm
            # whether the positive-class column [:, 1] was intended.
            proba_pred = estimator.predict_proba(X_val)

            optimal_thr = self._get_best_threshold(y_val, proba_pred)

            optimal_thrs_per_fold.append(optimal_thr)

        # average of the per-fold optima becomes the final threshold
        self.optimal_threshold = np.mean(optimal_thrs_per_fold)

        # BUGFIX: the original refit without the labels
        # (`self.estimator.fit(X, **fit_args)`).
        self.estimator.fit(X, y, **fit_args)

        # BUGFIX: the original never set this flag, so predict always
        # warned "You should fit first" and returned None.
        self.is_fitted = True

        return self

    def predict(self, X: (pd.DataFrame, np.array)) -> np.array:
        """
        Predict class labels using the tuned probability threshold.
        Warns and returns None if the model has not been fitted.
        """
        if self.is_fitted:

            proba_pred = self.estimator.predict_proba(X)

            return (proba_pred >= self.optimal_threshold).astype(int)

        else:
            self._logger.warn("You should fit first")

    def get_params(self, deep: bool = True):
        """
        Return the wrapped estimator's parameters extended with the
        wrapper-specific cv and cost_func entries.

        :param deep: forwarded to the wrapped estimator's get_params
            (added for sklearn compatibility; defaults preserve the
            original call signature)
        """
        params = self.estimator.get_params(deep=deep)

        params.update({"cv": self.cv, "cost_func": self.cost_func})

        return params

    def set_params(self, **params: dict):
        """
        Set cv / cost_func on this wrapper; every other parameter is
        forwarded to the wrapped estimator.
        """
        # BUGFIX: the original popped entries from `params` while iterating
        # over it, which raises RuntimeError in Python 3.
        if "cv" in params:
            self.cv = params.pop("cv")

        if "cost_func" in params:
            self.cost_func = params.pop("cost_func")

        self.estimator.set_params(**params)
+
+
if __name__ == "__main__":
    # Smoke test: tune the probability threshold of an XGBoost
    # random-forest classifier on a binarized iris problem.
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    import gc
    from xgboost import XGBRFClassifier

    data = load_iris()
    X, y = data["data"], data["target"]
    # reduce the 3-class target to a binary one (class 1 vs. rest)
    y = (y==1).astype(int)
    del data
    gc.collect()

    # make a custom cv object
    # expanding-window splits: train on [0, i), validate on [i, i + val_len)
    val_len = len(X)//10
    split_inds = range(len(X)//2, len(X), val_len)

    cv = []

    for i in split_inds:
        train_inds = list(range(i))
        val_inds = list(range(i, i + val_len))
        cv.append((train_inds, val_inds))

    clf = XGBRFClassifier()

    fine_tuned_clf = FineTunedClassifierCV(estimator=clf,
                                           cv=cv,
                                           greater_is_better=True,
                                           cost_func=accuracy_score)

    fine_tuned_clf.fit(X=X, y=y)

+ 1 - 0
cdplib/fine_tuning/__init__.py

@@ -0,0 +1 @@
# NOTE(review): the module name is spelled "Classifer" (missing the second
# "i") while the class defined in it is FineTunedClassifierCV -- confirm the
# file on disk really is FineTunedClassiferCV.py; otherwise this import breaks.
from .FineTunedClassiferCV import *

+ 384 - 0
cdplib/gridsearch/GridSearchPipelineSelector.py

@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:15:17 2020
+
+@author: tanya
+@description:a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is though in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import os
+import datetime
+import numpy as np
+from copy import deepcopy
+from itertools import product
+from collections import ChainMap
+from sklearn.pipeline import Pipeline
+from typing import Callable, Optional, Literal, Dict, Union, List
+from cdplib.log import Log
+
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
+
+
class GridSearchPipelineSelector(PipelineSelector):
    """
    A class for selecting a machine learning
     pipeline from a deterministic space of parameter distributions
     over multiple pipelines.
     The selection is done in such a way that a Trials object is being
     maintained during the tuning process from which one can retrieve
     the best pipeline so far as well as the entire tuning history
     if needed.
    """
    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: Optional[int] = None,
                 cross_validation_needs_scorer: bool = True,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Optional[Dict[str, Callable]] = None,
                 strategy_name: Optional[str] = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"
                 ):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space

        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.

        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and pipelines, and parameters tried out so far. If a trials object
            already exists at the given path, it is loaded and the
            search is continued, else, the search is started from scratch.

        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            if None, the trials object is backed up every time
            the score improves.

        :param Callable cross_val_averaging_func: Function to aggregate
            the cross-validation scores.
            Example different from the mean: mean - c*var.

        :param additional_metrics: dict of additional metrics to save
            of the form {"metric_name": metric} where metric is a Callable.

        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            cv object, cost function.
            When the strategy changes, one must start with new trials.

        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """
        try:

            super().__init__(
                cost_func=cost_func,
                greater_is_better=greater_is_better,
                trials_path=trials_path,
                backup_trials_freq=backup_trials_freq,
                cross_validation_needs_scorer=cross_validation_needs_scorer,
                cross_val_averaging_func=cross_val_averaging_func,
                additional_metrics=additional_metrics,
                strategy_name=strategy_name,
                stdout_log_level=stdout_log_level)

            self._logger = Log("GridsearchPipelineSelector: ",
                               stdout_log_level=stdout_log_level)

            # trials is a plain list of dicts here (unlike the hyperopt
            # selector, which keeps a hyperopt Trials object)
            self._trials = self._trials or []

        except Exception as e:
            err = "Failed initialization. Exit with error: {}".format(e)

            # self._logger may not exist yet if super().__init__ raised;
            # fall back to a fresh logger so the error is still reported
            logger = getattr(self, "_logger", None)\
                or Log("GridsearchPipelineSelector: ")

            logger.log_and_raise_error(err)

    def run_trials(self) -> None:
        """
        Exhaustively evaluate every pipeline/parameter combination in the
        attached space, skipping combinations already recorded in the
        (possibly restored) trials list, then back up the trials.
        """
        try:
            assert(self.attached_space),\
                "Parameter distribution space must be attached"

            # XXX Tanya: if the list of values is empty
            # in the space element, remove it

            # NOTE(review): assumes the result dict returned by
            # self._objective contains a "status" key (hyperopt
            # convention) -- confirm in the base class
            done_trial_ids = [{"name": trial["name"],
                               "params": trial["params"],
                               "status": trial["status"]}
                              for trial in self._trials]

            # list (generator) of (flattened) dictionaries
            # with all different combinations of
            # parameters for different pipelines
            # from the space definition.
            space_unfolded = ({"name": param_dist["name"],
                               "pipeline": param_dist["pipeline"],
                               "params": param_set}
                              for param_dist in self._space
                              for param_set in
                              (dict(ChainMap(*tup)) for tup in
                               product(*[[{k: v} for v in
                                          param_dist["params"][k]]
                                         for k in param_dist["params"]])))

            for space_element in space_unfolded:

                # uniquely identifies the current space element
                trial_id = {"name": space_element["name"],
                            "params": space_element["params"],
                            "status": 'ok'}

                # verify if the current pipeline/parameters
                # were already tested before
                if trial_id in done_trial_ids:
                    continue

                result = self._objective(space_element)

                # deepcopy so that the stored pipeline is not mutated by
                # later set_params calls on the shared space object
                pipeline = deepcopy(space_element["pipeline"])

                pipeline = pipeline.set_params(**space_element["params"])

                trial = {"name": space_element["name"],
                         "params": space_element["params"],
                         "pipeline": pipeline}

                trial.update(result)

                self._trials.append(trial)

            self.finished_tuning = True

            self.total_tuning_time = datetime.datetime.today()\
                - self.start_tuning_time

            self._backup_trials()

        except Exception as e:
            err = "Failed to run trials. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)

    @property
    def number_of_trials(self) -> Union[int, None]:
        """
        Number of trials already run in the current trials object
        """
        try:
            return len(self._trials)

        except Exception as e:
            err = ("Failed to retrieve the number of trials. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial(self) -> Union[dict, None]:
        """
        Trial dict (name, params, pipeline plus objective results)
        with the highest score recorded so far.
        """
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            # NOTE(review): max by raw score assumes scores are stored so
            # that larger is better -- confirm the base class negates the
            # cost when greater_is_better is False
            return max(self._trials, key=lambda x: x["score"])

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_score(self) -> Union[float, None]:
        '''
        Score of the best trial.
        '''
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return self.best_trial["score"]

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_score_variance(self) -> Union[float, None]:
        '''
        Cross-validation score variance of the best trial.
        '''
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return self.best_trial["score_variance"]

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_pipeline(self) -> Union[Pipeline, None]:
        '''
        Pipeline (with parameters set) of the best trial.
        '''
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return self.best_trial["pipeline"]

        except Exception as e:
            err = ("Could not retrieve the best trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def get_n_best_trial_pipelines(self, n: int)\
            -> Union[List[Pipeline], None]:
        """
        N best pipelines with corresponding
        best hyperparameters
        """
        try:
            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return [trial["pipeline"] for trial in
                    sorted(self._trials, key=lambda x: x["score"],
                           reverse=True)[:n]]

        except Exception as e:
            err = ("Failed to retrieve n best trials. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
            -> Union[Dict[str, List[Pipeline]], None]:
        """
        If the hyperparameter search is done over multiple
        pipelines, then returns n different pipeline-types
        with corresponding hyperparameters
        """
        try:
            # local import: pandas was not imported at module level in this
            # file (only inside the __main__ demo block), so calling this
            # method from importing code raised a NameError
            import pandas as pd

            assert(len(self._trials) > 0),\
                ("Trials object is empty. "
                 "Call run_trials method.")

            return pd.DataFrame(self._trials)\
                     .sort_values(by=["name", "score"],
                                  ascending=False)\
                     .groupby("name")\
                     .head(n)\
                     .groupby("name")["pipeline"]\
                     .apply(lambda x: list(x))\
                     .to_dict()

        except Exception as e:
            err = ("Failed to retrieve n best trials of each type."
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def trials_to_excel(self, path: str) -> None:
        """
        Trials object in the shape of table written to excel,
        should contain the run number, pipeline (as str),
        hyperparamters (as str), self.best_result (see self._objective method)
        as well as additional information configured
        through self.save_result method.
        """
        try:
            # local import: see get_n_best_trial_pipelines_of_each_type
            import pandas as pd

            pd.DataFrame(self._trials).to_excel(path)

        except Exception as e:
            err = ("Failed to write trials to excel. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)
+
+
if __name__ == "__main__":

    # elementary example: grid-search over the sample space on the
    # breast-cancer data set, with summaries saved to MongoDB
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import accuracy_score, precision_score
    from cdplib.gridsearch.space_sample import space
    from cdplib.db_handlers import MongodbHandler
    import pickle
    import pandas as pd

    trials_path = "gridsearch_trials_TEST.pkl"
    additional_metrics = {"precision": precision_score}
    strategy_name = "strategy_1"
    data_path = "data_TEST.h5"
    cv_path = "cv_TEST.pkl"
    collection_name = 'TEST_' + strategy_name

    logger = Log("GridSearchPipelineSelector__TEST:")

    logger.info("Start test")

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    # the selector reads its training data from an hdf5 store
    pd.DataFrame(X).to_hdf(data_path, key="X_train")
    pd.Series(y).to_hdf(data_path, key="y_train")

    # two expanding train/validation splits, pickled for the selector
    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]

    pickle.dump(cv, open(cv_path, "wb"))

    gs = GridSearchPipelineSelector(cost_func=accuracy_score,
                                    greater_is_better=True,
                                    trials_path=trials_path,
                                    additional_metrics=additional_metrics,
                                    strategy_name=strategy_name,
                                    stdout_log_level="WARNING")

    gs.attach_space(space=space)

    gs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
                             cv_pickle_path=cv_path)

    save_method = MongodbHandler().insert_data_into_collection
    save_kwargs = {'collection_name': collection_name}

    gs.configer_summary_saving(save_method=save_method,
                               kwargs=save_kwargs)

    gs.run_trials()

    logger.info("Best trial: {}".format(gs.best_trial))
    logger.info("Total tuning time: {}".format(gs.total_tuning_time))

    # clean up the temporary artifacts created above
    for file in [trials_path, data_path, cv_path]:
        os.remove(file)

    logger.info("End test")

    # XXX Tanya check warnings

+ 2 - 0
cdplib/gridsearch/__init__.py

@@ -0,0 +1,2 @@
+from .GridSearchPipelineSelector import *
+from .space_sample import *

+ 33 - 0
cdplib/gridsearch/space_sample.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+
# Deterministic grid-search space: a list of candidate pipelines, each entry
# holding a unique "name", a sklearn Pipeline, and a "params" dict mapping
# step__param names to the list of values to try (the grid evaluated per
# pipeline is the cartesian product of these lists).
space = [
        {"name": "std_scaler_kbest_rf",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("kbest", SelectPercentile()),
                 ("rf", RandomForestClassifier())]),
         "params": {"kbest__percentile": [2, 3],
                    "rf__n_estimators": [10, 20]}},

        {"name": "std_scaler_pca_lr",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("pca", PCA()),
                 ("lr", LogisticRegression())]),
         "params": {"lr__C": [0.5, 1],
                    "pca__n_components": [2, 3]}}
        ]

+ 499 - 0
cdplib/hyperopt/HyperoptPipelineSelector.py

@@ -0,0 +1,499 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct  6 15:04:25 2020
+
+@author: tanya
+@description:a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is though in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
import datetime
import os
import pickle
from copy import deepcopy
from typing import Callable, Optional, Literal, Dict, Union, List

import numpy as np
import pandas as pd
from hyperopt import fmin, tpe, rand, Trials, space_eval
from sklearn.pipeline import Pipeline

from cdplib.log import Log
from cdplib.pipeline_selector.PipelineSelector import PipelineSelector,\
     SpaceElementType
+
+
class HyperoptPipelineSelector(PipelineSelector):
    """
    Use this class to perform a search
    for a machine learning pipeline in a given parameter space.
    The parameter space can include multiple types of Pipelines
    (SVM, XGBOOST, random forest, etc),
    as well as parameter distributions for each pipeline parameter.
    See example in main for the expected space structure.

    The search can be performed either randomly
    or with a tree-based algorithm. (Other methods are currently
    developed by hyperopt creators).

    Attribute trials is responsible for book-keeping parameter
    combinations that have already been tried out. This attribute
    is saved to a binary file every n minutes as well as every time
    a better pipeline was found.
    """
    def __init__(self,
                 cost_func: Union[Callable, str],
                 greater_is_better: bool,
                 trials_path: str,
                 backup_trials_freq: Optional[int] = None,
                 cross_validation_needs_scorer: bool = True,
                 cross_val_averaging_func: Callable = np.mean,
                 additional_metrics: Optional[Dict[str, Callable]] = None,
                 strategy_name: Optional[str] = None,
                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
                 = "INFO"):
        """
        :param Callable cost_func: function to minimize or maximize
            over the elements of a given (pipeline/hyperparameter) space

        :param bool greater_is_better: when True
            cost_func is maximized, else minimized.

        :param str trials_path: path at which the trials object is saved
            in binary format. From the trials object we can
            select information about the obtained scores, score variations,
            and pipelines, and parameters tried out so far. If a trials object
            already exists at the given path, it is loaded and the
            search is continued, else, the search is started from scratch.

        :param backup_trials_freq: frequency in iterations (trials)
            of saving the trials object at the trials_path.
            if None, the trials object is backed up every time
            the score improves.

        :param Callable cross_val_averaging_func: Function to aggregate
            the cross-validation scores.
            Example different from the mean: mean - c*var.

        :param additional_metrics: dict of additional metrics to save
            of the form {"metric_name": metric} where metric is a Callable.

        :param str strategy_name:
            a strategy is defined by the data set (columns/features and rows),
            cv object, cost function.
            When the strategy changes, one must start with new trials.

        :param str stdout_log_level: can be INFO, WARNING, ERROR
        """

        try:

            super().__init__(
                cost_func=cost_func,
                greater_is_better=greater_is_better,
                trials_path=trials_path,
                backup_trials_freq=backup_trials_freq,
                cross_validation_needs_scorer=cross_validation_needs_scorer,
                cross_val_averaging_func=cross_val_averaging_func,
                additional_metrics=additional_metrics,
                strategy_name=strategy_name,
                stdout_log_level=stdout_log_level)

            # Log previously was not imported at module level in this file
            # (only inside the __main__ block), so every instantiation
            # failed with a NameError; the import is now at the top.
            self._logger = Log("HyperoptPipelineSelector: ",
                               stdout_log_level=stdout_log_level)

            self._trials = self._trials or Trials()

        except Exception as e:
            err = "Failed to intialize. Exit with error: {}".format(e)

            # self._logger may not exist if super().__init__ raised;
            # fall back to a fresh logger so the error is still reported
            logger = getattr(self, "_logger", None)\
                or Log("HyperoptPipelineSelector: ")

            logger.log_and_raise_error(err)

    def run_trials(self,
                   niter: int,
                   algo: Callable = tpe.suggest) -> None:
        '''
        Method performing the search of the best pipeline in the given space.
        Calls fmin function from the hyperopt library to minimize the output of
        _objective.

        :params int niter: number of search iterations
        :param algo: a hyperopt suggest function: tpe.suggest for a
            tree-based bayesian search, or rand.suggest for randomized
            search. (Annotated as Callable: PEP 586 forbids non-literal
            values such as function objects inside Literal[...].)
        '''
        try:
            self._trials = self._trials or Trials()

            self._logger.info(("Starting {0} iterations of search "
                               "additional to {1} previous"
                               .format(niter, len(self._trials.trials))))

            best_trial = fmin(fn=self._objective,
                              space=self._space,
                              algo=algo,
                              trials=self._trials,
                              max_evals=len(self._trials.trials) + niter)

            self._logger.info(
                    "Best score is {0} with variance {1}"
                    .format(
                     self._trials.best_trial["result"]["score"],
                     self._trials.best_trial["result"]["score_variance"]))

            self._logger.info(("Finished {0} iterations of search.\n"
                               "Best parameters are:\n {1} ")
                              .format(niter,
                                      space_eval(self._space, best_trial)))

            self.finished_tuning = True

            self.total_tuning_time = datetime.datetime.today()\
                - self.start_tuning_time

            self._backup_trials()

        except Exception as e:
            err = ("Failed to select best "
                   "pipeline! Exit with error: {}").format(e)

            self._logger.log_and_raise_error(err)

    @property
    def number_of_trials(self) -> Union[int, None]:
        """
        :return: number of trials run so far
            with the given Trials object
        """

        try:
            return len(self._trials.trials)

        except Exception as e:
            err = ("Failed to retrieve the number of trials. "
                   "Exit with error {}".format(e))

            self._logger.log_and_raise_error(err)

    def _get_space_element_from_trial(self, trial: dict)\
            -> Union[Dict[str, SpaceElementType], None]:
        """
        Hyperopt trials object does not contain the space
             elements that result in the corresponding trials.
             One has to use the function space_eval from
             hyperopt to get the space element.

        After retrieving the space element,
            parameters of the pipeline are set.
        """
        try:
            trial = deepcopy(trial)

            assert(self.attached_space),\
                "Hyperparameter space not attached."

            # entries with empty value lists correspond to parameters of
            # pipelines other than the one chosen in this trial
            space_element = space_eval(self._space,
                                       {k: v[0] for k, v in
                                        trial['misc']['vals'].items()
                                        if len(v) > 0})

            # deepcopy so that the pipeline stored in the space is not
            # mutated by set_params
            pipeline = deepcopy(space_element["pipeline"])
            params = deepcopy(space_element["params"])
            pipeline.set_params(**params)

            space_element["pipeline"] = pipeline

            return space_element

        except Exception as e:
            err = ("Failed to retrieve a space element from a trial. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def _get_space_element_from_index(self, i: int)\
            -> Union[Dict[str, SpaceElementType], None]:
        """
        Gets the space element of shape
        {"name": NAME, "params": PARAMS, "pipeline": PIPELINE}
        from the trial number i.
        """
        try:
            assert(len(self._trials.trials) > i),\
                ("Trials object is not long enough "
                 "to retrieve index {}".format(i))

            return self._get_space_element_from_trial(self._trials.trials[i])

        except Exception as e:
            err = ("Failed to get space element from index. "
                   "Exit with error {}".format(e))

            self._logger.log_and_raise_error(err)

    def _get_pipeline_from_index(self, i: int) -> Union[Pipeline, None]:
        """
        Gets a pipeline with set parameters from the trial number i
        """
        try:
            space_element = self._get_space_element_from_index(i)

            return space_element["pipeline"]

        except Exception as e:
            err = ("Failed to retrieve pipeline from index. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial(self) -> Union[dict, None]:
        """
        :return: dictionary with the summary of the best trial
            and space element (name, pipeline, params)
            resulting in the best trial
        """
        if len(self._trials.trials) == 0:

            self._logger.log_and_throw_warning("Trials object is empty")
            return {}

        else:

            try:
                best_trial = deepcopy(self._trials.best_trial)

                if self.attached_space:

                    space_element = self._get_space_element_from_trial(
                            best_trial)
                else:
                    space_element = {}

                    warn = ("Space is not attached, "
                            "To included the best pipeline "
                            "attach the space")
                    self._logger.log_and_throw_warning(warn)

                # the returned summary is the trial's result dict enriched
                # with the (optional) space element
                best_trial = deepcopy(self._trials.best_trial["result"])

                best_trial.update(space_element)

                return best_trial

            except Exception as e:
                err = "Failed to retrieve best trial. Exit with error: {}"\
                    .format(e)

                self._logger.log_and_raise_error(err)

    @property
    def best_trial_score(self) -> Union[float, None]:
        """
        Score of the best trial, or nan if there are no trials yet.
        """
        try:
            if len(self.best_trial) > 0:
                return self.best_trial["score"]
            else:
                return np.nan

        except Exception as e:
            err = ("Failed to retrieve best trial score. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_score_variance(self) -> Union[float, None]:
        """
        Cross-validation score variance of the best trial,
        or nan if there are no trials yet.
        """
        try:
            if len(self.best_trial) > 0:
                return self.best_trial["score_variance"]
            else:
                return np.nan

        except Exception as e:
            err = ("Failed to retrieve best trial score variance. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    @property
    def best_trial_pipeline(self) -> Union[Pipeline, None]:
        """
        Pipeline (with parameters set) of the best trial,
        or nan if there are no trials yet.
        """
        try:
            if len(self.best_trial) > 0:
                return self.best_trial["pipeline"]
            else:
                return np.nan

        except Exception as e:
            err = ("Failed to retrieve best trial pipeline. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def get_n_best_trial_pipelines(self, n: int)\
            -> Union[List[Pipeline], None]:
        """
        :return: the list of n best pipelines
        documented in trials
        """
        try:
            if len(self._trials.trials) == 0:
                return []
            else:
                # NOTE(review): sorting by raw score assumes larger is
                # better -- confirm the base class negates the cost when
                # greater_is_better is False
                n_best_trials = sorted(self._trials.trials,
                                       key=lambda x: x["result"]["score"],
                                       reverse=True)[:n]

                return [self._get_space_element_from_trial(trial)["pipeline"]
                        for trial in n_best_trials]

        except Exception as e:
            err = ("Failed to retrieve n best pipelines. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
            -> Union[Dict[str, List[Pipeline]], None]:
        """
        :return: a dictionary where keys are pipeline names,
        and values are lists of best pipelines with this name
        """
        try:
            scores = [trial["result"]["score"]
                      for trial in self._trials.trials]

            names = [self._get_space_element_from_trial(trial)["name"]
                     for trial in self._trials.trials]

            return pd.DataFrame({"name": names, "score": scores})\
                     .sort_values(by=["name", "score"], ascending=False)\
                     .groupby("name")\
                     .head(n)\
                     .reset_index()\
                     .assign(pipeline=lambda x: x["index"]
                             .apply(self._get_pipeline_from_index))\
                     .groupby("name")["pipeline"]\
                     .apply(lambda x: list(x))\
                     .to_dict()

        except Exception as e:
            err = ("Failed to get n best pipelines of each type. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)

    def trials_to_excel(self, path: Optional[str] = None) -> None:
        """
        Saves an excel file with pipeline names, scores,
        parameters, and timestamps.
        """
        try:
            results = [trial["result"] for trial in self._trials.trials]

            space_elements = [self._get_space_element_from_trial(trial)
                              for trial in self._trials.trials]

            pd.DataFrame([{**result, **space_element}
                          for result, space_element in
                          zip(results, space_elements)]).to_excel(path)

        except Exception as e:
            err = ("Failed to write trials to excel. "
                   "Exit with error: {}".format(e))

            self._logger.log_and_raise_error(err)
+
+
if __name__ == '__main__':

    # elementary example: hyperopt search over the sample space on the
    # breast-cancer data set, with summaries saved to MongoDB
    from sklearn.metrics import roc_auc_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from cdplib.log import Log
    from cdplib.db_handlers import MongodbHandler
    from cdplib.hyperopt.space_sample import space
    # from cdplib.hyperopt.composed_space_sample import space

    trials_path = "hyperopt_trials_TEST.pkl"
    additional_metrics = {"precision": precision_score}
    strategy_name = "strategy_1"
    data_path = "data_TEST.h5"
    cv_path = "cv_TEST.pkl"
    collection_name = 'TEST_' + strategy_name

    logger = Log("HyperoptPipelineSelector__TEST:")

    logger.info("Start test")

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    # the selector reads its training data from an hdf5 store
    pd.DataFrame(X).to_hdf(data_path, key="X_train")
    pd.Series(y).to_hdf(data_path, key="y_train")

    # two expanding train/validation splits, pickled for the selector
    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]

    pickle.dump(cv, open(cv_path, "wb"))

    hs = HyperoptPipelineSelector(cost_func=roc_auc_score,
                                  greater_is_better=True,
                                  trials_path=trials_path,
                                  additional_metrics=additional_metrics,
                                  strategy_name=strategy_name,
                                  stdout_log_level="WARNING")

    hs.attach_space(space=space)

    hs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
                             cv_pickle_path=cv_path)

    try:

        # TODO: this line causes a pytype to throw not-callable error
        # works fine with pytype on other class methods.
        save_method = MongodbHandler().insert_data_into_collection
        save_kwargs = {'collection_name': collection_name}

        # save_method = pd.DataFrame.to_excel()
        # save_kwargs = {'excel_writer': "TEST.xlsx"}

        hs.configer_summary_saving(save_method=save_method,
                                   kwargs=save_kwargs)

        logger.info("Configured summary saving in mongo")

    except Exception as e:

        # best-effort: the search still runs without summary saving
        logger.warning(("Could not configure summary saving in mongo. "
                        "Exit with error: {}".format(e)))

    hs.run_trials(niter=10)

    logger.info("Best Trial: {}".format(hs.best_trial))
    logger.info("Total tuning time: {}".format(hs.total_tuning_time))

    # clean up the temporary artifacts created above
    for file in [trials_path, data_path, cv_path]:
        os.remove(file)

    logger.info("End test")

+ 3 - 0
cdplib/hyperopt/__init__.py

@@ -1 +1,4 @@
 from .HyperoptPipelineSelection import *
+from .HyperoptPipelineSelector import *
+from .composed_space_sample import *
+from .space_sample import *

+ 116 - 0
cdplib/hyperopt/composed_space_sample.py

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  6 14:02:24 2020

@author: tanya
@description: space object to pass to HyperoptPipelineSelection class
"""
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
from xgboost import XGBRFClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from hyperopt import hp

from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer

# TODO: add sample spaces for encoders and transformers

encoders = []

transformers = []

# Feature-selection steps, each as
# {"name": ..., "object": sklearn transformer, "params": hyperopt space}.
# Parameter labels must be unique across the whole space, hence the
# "<step name>__" prefixes.
selectors = [
    {"name": "kbest",
     "object": SelectPercentile(),
     "params": {
       "percentile": 3 + hp.randint("kbest__percentile", 60),
       "score_func": hp.choice("kbest__score_func",
                               [f_classif, chi2, mutual_info_classif])}},

    {"name": "fpr",
     "object": SelectFpr(),
     "params": {
        "score_func": hp.choice("fpr__score_func",
                                [f_classif, chi2]),
        # mutual_info_classif does not work here
        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},

    {"name": "rfe_rf",
     "object":
         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
     "params": {
         "n_features_to_select":
             3 + hp.randint("rfe_rf__n_features_to_select", 200),
         "estimator__n_estimators":
             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},

    {"name": "rfm_rf",
     "object":
         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
                                                          random_state=33)),
     "params": {
         "estimator__n_estimators":
             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},

    {"name": "rfm_lr",
     "object":
         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
                                                      random_state=33)),
     "params": {
          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},

    {"name": "std_scaler_pca",
     "object": Pipeline([
             ("scaler", StandardScaler()),
             ("pca", PCA(random_state=33))]),
     "params": {
        "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
       }}
    ]

# Classifier steps in the same {"name", "object", "params"} format.
models = [
        {"name": "xgb",
         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
         "params": {
           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
           # Fixed: hp.loguniform expects bounds in log-space; the
           # previous (0.01, 0.5) sampled exp(uniform(0.01, 0.5)),
           # i.e. learning rates between ~1.01 and ~1.65.
           "learning_rate": hp.loguniform("xgb__learning_rate",
                                          np.log(0.01), np.log(0.5))
           }},

        {"name": "rf",
         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
         "params": {
           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
           "max_depth": 3 + hp.randint("rf__max_depth", 10),
           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
           }},

        # the default solver does not accept l1 penalty
        {"name": "lr",
         "object": LogisticRegression(random_state=33,
                                      solver='liblinear',
                                      # n_jobs=-1
                                      ),
         "params":  {
           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
           "C": hp.uniform("lr__C", 0.1, 1000)}},

        # svc does not support parallelization, therefore is slow
        {"name": "svc",
         "object": SVC(random_state=33),
         "params": {
            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
            "degree": 2 + hp.randint("svc__degree", 3),
            "C": hp.uniform("svc__C", 0.1, 1000)
            }}
        ]

# Ready-to-use hyperopt space over all step combinations.
step_list = [encoders, transformers, selectors, models]

space = SpaceComposer().compose_hyperopt_space(step_list)

+ 40 - 0
cdplib/hyperopt/space_sample.py

@@ -0,0 +1,40 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct  5 09:50:24 2020

@author: tanya

Minimal hand-written hyperopt search space over two candidate
pipelines; illustrates the {"name", "pipeline", "params"} element
format expected by HyperoptPipelineSelector.
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from hyperopt import hp
import numpy as np


# hp.choice draws one of the listed pipeline configurations per trial;
# the nested hp.* expressions are sampled by hyperopt per trial as well.
space = hp.choice("pipelines", [

        # scaling -> percentile feature selection -> random forest
        {"name": "std_scaler_kbest_rf",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("kbest", SelectPercentile()),
                 ("rf", RandomForestClassifier())]),
         "params": {"kbest__percentile":
                    hp.choice('kbest__percentile', range(1, 3)),
                    "rf__n_estimators":
                    50 + hp.randint('rf__n_estimators', 50)}},

        # scaling -> PCA -> logistic regression
        {"name": "std_scaler_pca_lr",
         "pipeline": Pipeline([
                 ("std_scaler", StandardScaler()),
                 ("pca", PCA()),
                 ("lr", LogisticRegression())]),
         # loguniform bounds are given in log-space, as required
         "params": {"lr__C":
                    hp.loguniform("lr__C", np.log(0.01), np.log(0.1)),
                    "pca__n_components":
                    1 + hp.randint("pca__n_components", 4)}}
        ])

+ 85 - 0
cdplib/hyperparameter_space_composer/SpaceComposer.py

@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 13:54:04 2020
+
+@author: tanya
+@description: a class that from a given list of pipeline steps
+ composes a space to be passed in the GridsearchPipelineSelector
+ or HyperoptPipelineSelector classes.
+ A classic list of steps would be: [encoders, transformers, selectors, models]
+"""
+from sklearn.pipeline import Pipeline
+from hyperopt import hp
+from itertools import product
+
+
class SpaceComposer:
    """
    Builds hyperparameter search spaces from a list of pipeline step
    groups, in the formats expected by GridsearchPipelineSelector
    (a plain list) and HyperoptPipelineSelector (an hp.choice node).
    """
    def compose_gridsearch_space(self, step_list: list) -> list:
        """
        Build a deterministic search space for the
        GridsearchPipelineSelector class.

        :param step_list: list of step groups, classically
            [encoders, transformers, selectors, models]; each group is
            a list of dicts {"name": str, "object": estimator,
            "params": dict}, for example
            {"name": "kbest",
             "object": SelectPercentile(),
             "params": {"percentile": [5, 10, 20],
                        "score_func": [f_classif, chi2,
                                       mutual_info_classif]}}

        :return: a list of dictionaries of form
            {"name": NAME, "pipeline": PIPELINE, "params": PARAMS},
            one per combination of steps (empty groups are skipped).
        """
        non_empty_groups = [group for group in step_list
                            if len(group) > 0]

        return [self._space_element(combination)
                for combination in product(*non_empty_groups)]

    def _space_element(self, combination: tuple) -> dict:
        """
        Build one {"name", "pipeline", "params"} space entry from a
        tuple holding exactly one step per step group.
        """
        params = {}

        for step in combination:
            for param_name, param_dist in step["params"].items():
                params[step["name"] + "__" + param_name] = param_dist

        return {
            "name": "_".join(step["name"] for step in combination),
            "pipeline": Pipeline([(step["name"], step["object"])
                                  for step in combination]),
            "params": params}

    def compose_hyperopt_space(self, step_list: list) -> hp.choice:
        """
        Build a stochastic search space for HyperoptPipelineSelector
        by wrapping the gridsearch space into a single hyperopt choice
        node labelled "pipelines". Step params are hyperopt
        expressions, e.g.
        {"percentile": 3 + hp.randint("kbest__percentile", 200)}.
        """
        return hp.choice("pipelines", self.compose_gridsearch_space(step_list))

+ 1 - 0
cdplib/hyperparameter_space_composer/__init__.py

@@ -0,0 +1 @@
+from .SpaceComposer import *

+ 13 - 1
cdplib/log.py

@@ -7,6 +7,7 @@ import sys
 import os
 import logging
 from datetime import datetime
+import warnings
 
 sys.path.append(os.getcwd())
 
@@ -121,9 +122,20 @@ class Log():
 
         raise Exception(message)
 
+    def log_and_throw_warning(self, message):
+        '''
+        '''
+        self._logger.warning(message)
+
+        warnings.warn(message)
+        
     def log_and_raise_warning(self, message):
         '''
         '''
+        warnings.warn(("This method has been depricated. "
+                       "User log_and_throw_warning instead"),
+                      DeprecationWarning)
+        
         self._logger.warning(message)
 
-        raise Warning(message)
+        warnings.warn(message)

+ 272 - 0
cdplib/ml_validation/CVComposer.py

@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 10:27:39 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List, NewType
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile, chain
+
+from cdplib.log import Log
+
+
# A cv object: an iterable of (train_indices, test_indices) pairs,
# matching what sklearn's model-selection utilities accept.
CVType = NewType("CVType", Iterable[Tuple[List]])

# Any tabular container this module accepts as a feature/target set.
DataSetType = NewType("DataSetType",
                      Union[pd.DataFrame, pd.Series, np.ndarray, List])
+
+
class CVComposer:
    """
    Groups methods for composing cv objects
    that follow standards from sklearn
    (iterables of (train_indices, test_indices) pairs);
    these cv objects can be passed to algorithms like gridsearch, etc.
    """
    def __init__(self):
        """
        Creates the instance logger.
        """
        self._logger = Log("CVComposer: ")

    def dummy_cv(
            self,
            train_set_size: Union[int, None] = None,
            train_index: Union[pd.Series, np.ndarray, None] = None,
            test_set_size: Union[int, None] = None,
            test_index: DataSetType = None) -> CVType:
        """
        Return a single-fold cv object [(train_index, test_index)].

        Exactly one of train_set_size/train_index must be given, and
        exactly one of test_set_size/test_index. When only sizes are
        given, integer ranges are used as indices, with the test range
        starting right after the train range.
        """
        assert((train_index is None) != (train_set_size is None)),\
            "Set train_index or train_set_size"

        # Fixed: the original message wrongly referred to the train
        # arguments here.
        assert((test_index is None) != (test_set_size is None)),\
            "Set test_index or test_set_size"

        train_index = train_index if (train_index is not None)\
            else list(range(train_set_size))

        # Fixed: when train_index was given, train_set_size is None and
        # the range() below raised a TypeError; derive the size from
        # the index instead.
        train_size = train_set_size if (train_set_size is not None)\
            else len(train_index)

        test_index = test_index if (test_index is not None)\
            else list(range(train_size, train_size + test_set_size))

        return [(train_index, test_index)]

    def dummy_cv_and_concatenated_data_set(
            self,
            X_train: DataSetType,
            X_test: DataSetType,
            y_train: Union[DataSetType, None] = None,
            y_test: Union[DataSetType, None] = None)\
            -> Tuple[DataSetType, DataSetType, CVType]:
        """
        Concatenate the train and test sets and return (cv, X, y),
        where cv is a single fold separating the original train part
        from the original test part.

        Pandas indices are preserved only when both X parts are
        DataFrames with non-overlapping indices; otherwise the result
        is converted to numpy (and a warning is logged).
        """
        assert((y_test is None) == (y_train is None)),\
            "Set both y_train and y_test or neither"

        # Fixed: 'and' between the two sets returned the second set;
        # '&' computes the intended index intersection.
        use_index = (isinstance(X_train, pd.DataFrame) and
                     isinstance(X_test, pd.DataFrame) and
                     (len(set(X_train.index) & set(X_test.index)) == 0))

        if use_index:

            # Fixed: the original passed train_set_index/test_set_index,
            # which are not parameters of dummy_cv (TypeError).
            cv = self.dummy_cv(train_index=X_train.index,
                               test_index=X_test.index)

            X = pd.concat([X_train, X_test], ignore_index=False, axis=0)

        else:
            cv = self.dummy_cv(train_set_size=len(X_train),
                               test_set_size=len(X_test))

            X = np.concatenate([X_train, X_test])

        # Targets keep their pandas index only when it matches the
        # corresponding X part exactly.
        use_target_index = use_index and (
                    isinstance(y_train, pd.Series) and
                    isinstance(y_test, pd.Series) and
                    (X_train.index.equals(y_train.index)) and
                    (X_test.index.equals(y_test.index)))

        if use_target_index:

            y = pd.concat([y_train, y_test], ignore_index=False, axis=0)

        else:

            y = np.concatenate([y_train, y_test]) if (y_train is not None)\
                else None

        # Warn when pandas input had to be degraded to numpy.
        result_to_np = (
            (isinstance(X_train, pd.DataFrame) !=
             isinstance(X_test, pd.DataFrame)) or
            (isinstance(X_train, pd.DataFrame)) and
            (len(set(X_train.index) & set(X_test.index)) != 0))

        if result_to_np:
            self._logger.log_and_throw_warning(
                    "The concatenated dataframe is converted to numpy")

        return cv, X, y

    def expanding_cv(self, test_proportion: float,
                     start_train_proportion: float,
                     step_proportion: float = None,
                     expanding_test_size: bool = False,
                     data_set_size: Union[float, None] = None,
                     index: Union[pd.Series, np.ndarray, list, None] = None)\
            -> Union[Iterable[Tuple[List]], None]:
        """
        Yield (train_index, test_index) pairs where the train window
        always starts at the beginning of the data set and grows by
        step_proportion of the data set at every fold.

        :param test_proportion: test fold length, share of the data set.
        :param start_train_proportion: length of the first train fold.
        :param step_proportion: growth of the train fold per step;
            defaults to test_proportion when not given.
        :param expanding_test_size: when True, the test fold length
            grows proportionally to the current train fold length.
        :param data_set_size: total sample count; exactly one of
            data_set_size/index must be given.
        :param index: explicit index to slice from instead of a range.
        """
        try:
            assert((index is None) != (data_set_size is None)),\
                "Set index or data_set_size"

            index = pd.Series(index) if (index is not None)\
                else pd.Series(range(data_set_size))

            data_set_size = data_set_size or len(index)

            # Fixed: step_proportion=None used to raise a TypeError;
            # fall back to the test fold size as the step.
            if step_proportion is None:
                step_proportion = test_proportion

            start_train_size = int(start_train_proportion * data_set_size)
            step_size = int(step_proportion * data_set_size)

            test_size = int(test_proportion * data_set_size)

            # Train sizes: start, start+step, ... while a test fold
            # still fits behind the train window.
            train_inds_set = (list(range(train_size))
                              for train_size in
                              takewhile(
                                      lambda x: x <= data_set_size - test_size,
                                      accumulate(repeat(start_train_size),
                                                 lambda x, _: x + step_size)))

            for train_inds in train_inds_set:

                if expanding_test_size:

                    yield (index[train_inds],
                           index[train_inds[-1] + 1:
                                 train_inds[-1] + 1
                                 + int(test_proportion*len(train_inds))])

                else:

                    yield (index[train_inds],
                           index[train_inds[-1] + 1:
                                 train_inds[-1] + 1 + test_size])

        except Exception as e:
            self._logger.log_and_raise_error(("Failed to make expanding cv. "
                                              "Exit with error: {}".format(e)))

    def sliding_window_cv(
        self,
        test_proportion: float,
        train_proportion: float,
        step_proportion: float = None,
        data_set_size: Union[float, None] = None,
        index: Union[pd.Series, np.ndarray, list, None] = None)\
            -> Union[Iterable[Tuple[List]], None]:
        """
        Return a generator of (train_index, test_index) pairs in which
        a train window of fixed length slides forward by
        step_proportion of the data set at every fold, followed by a
        test fold of fixed length.
        """
        try:
            assert((index is None) != (data_set_size is None)),\
                "Set index or data_set_size"

            index = pd.Series(index) if (index is not None)\
                else pd.Series(range(data_set_size))

            data_set_size = data_set_size or len(index)

            # Fixed: step_proportion=None used to raise a TypeError;
            # fall back to the test fold size as the step.
            if step_proportion is None:
                step_proportion = test_proportion

            train_size = int(train_proportion * data_set_size)
            test_size = int(test_proportion * data_set_size)
            step_size = int(step_proportion * data_set_size)

            train_sizes = takewhile(lambda x: x <= data_set_size - test_size,
                                    accumulate(repeat(train_size),
                                               lambda x, _: x + step_size))

            train_starts = takewhile(lambda x: x <= data_set_size
                                     - train_size - test_size,
                                     accumulate(repeat(step_size),
                                                lambda x, _: x + step_size))

            train_starts = chain([0], train_starts)

            # Paired starts and ends produce windows of constant
            # length train_size.
            train_inds_set = list(range(train_start, train_size)
                                  for train_start, train_size in
                                  zip(train_starts, train_sizes))

            cv = ((index[train_inds], index[train_inds[-1] + 1:
                                            train_inds[-1] + 1 + test_size])
                  for train_inds in train_inds_set)

            return cv

        except Exception as e:
            self._logger.log_and_raise_error(
                    ("Failed to make sliding window cv. "
                     "Exit with error: {}".format(e)))

    def nested_expanding_cv(self,
                            test_proportion: float,
                            start_train_proportion: float,
                            step_proportion: float = None,
                            expanding_test_size: bool = False,
                            data_set_size: Union[float, None] = None,
                            index: Union[pd.Series, np.ndarray,
                                         list, None] = None)\
            -> Iterable[Tuple[List]]:
        """
        Build an inner expanding cv over the train part of every outer
        expanding-cv fold; returns a list with one inner cv (a list of
        (train, test) pairs) per outer fold.
        """
        try:
            cv = self.expanding_cv(test_proportion=test_proportion,
                                   start_train_proportion=start_train_proportion,
                                   step_proportion=step_proportion,
                                   expanding_test_size=expanding_test_size,
                                   data_set_size=data_set_size,
                                   index=index)

            nested_cv = []

            for train_inds, test_inds in cv:

                # Inner folds are built on the outer train index when
                # an explicit index was given, else on its length only.
                fold_index = train_inds if index is not None\
                    else None

                fold_size = len(train_inds) if index is None else None

                fold_cv = self.expanding_cv(
                        test_proportion=test_proportion,
                        start_train_proportion=start_train_proportion,
                        step_proportion=step_proportion,
                        expanding_test_size=expanding_test_size,
                        data_set_size=fold_size,
                        index=fold_index)

                nested_cv.append(list(fold_cv))

            return nested_cv

        except Exception as e:
            # Consistency: use the instance logger instead of a
            # throw-away local Log object.
            self._logger.log_and_raise_error(
                    ("Failed to make nested expanding cv. "
                     "Exit with error: {}".format(e)))

    def cv_slice_dataset(self, X, y, train_inds, test_inds)\
            -> Tuple[Union[pd.DataFrame, np.ndarray],
                     Union[pd.Series, np.ndarray]]:
        """
        Slice X and y into train and validation parts by the given
        index collections (label-based .loc for DataFrames, positional
        otherwise).

        :return: X_train, X_val, y_train, y_val; the target parts are
            None when y is None.
        """
        if isinstance(X, pd.DataFrame):
            X_train = X.loc[train_inds]
            X_val = X.loc[test_inds]
        else:
            X_train = X[train_inds]
            X_val = X[test_inds]

        # Fixed: the original raised a NameError when y was None,
        # because y_train/y_val were only bound inside the if-branch.
        if y is not None:
            y_train = y[train_inds]
            y_val = y[test_inds]
        else:
            y_train = None
            y_val = None

        return X_train, X_val, y_train, y_val


+ 2 - 0
cdplib/ml_validation/__init__.py

@@ -0,0 +1,2 @@
+from .cross_validate_with_fine_tuning import *
+from .CVComposer import *

+ 387 - 0
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 29 13:58:23 2020
+
+@author: tanya
+
+
+@description:
+
+* Input:
+    - pipeline/hyperparameter space
+    - data_train
+    - cv
+    - cv_folds
+
+* For each pipeline:
+
+    -> Split data_train into folds according to cv
+
+     -> For each fold:
+
+         => get data_train_fold, data_test_fold, cv_fold
+
+         => split data_train_fold into subfolds according to cv_fold
+
+         => For each subfold:
+
+             ==> get data_train_subfold, data_test_subfold
+
+             ==> train pipeline on data_train_subfold
+
+             ==> find best_threshold_subfold on data_test_subfold
+
+        => Find averaged_threshold_fold averaged over best_threshold_subfold
+
+        => train pipeline on data_train_fold
+
+        => find score_fold on data_test_fold with proba_threshold_fold
+
+        => find best_threshold_fold on data_test_fold
+
+    -> find score averaged over score_fold
+
+    -> find averaged_threshold averaged over best_threshold_fold
+
+* choose (pipeline/hyperparameters, threshold) in the space with best score
+
+"""
+
+import sys
+
+import pandas as pd
+import numpy as np
+from itertools import zip_longest
+
+if sys.version_info >= (3, 8):
+    from typing import Callable, Dict, Iterable, Union
+else:
+    from typing_extensions import Callable, Dict, Iterable, Union
+
+from copy import deepcopy
+
+from sklearn.model_selection import StratifiedKFold
+
+from cdplib.log import Log
+
+from cdplib.ml_validation.CVComposer import CVComposer
+
+
+# TODO: write with yield !!!!
+
def get_optimal_proba_threshold(score_func: Callable,
                                y_true: Union[pd.Series, np.ndarray],
                                proba: Union[pd.Series, np.ndarray],
                                threshold_set: Union[Iterable, None] = None):
    """
    Pick the probability threshold from threshold_set that maximizes
    score_func(y_true, proba >= threshold).

    When threshold_set is None, the grid 0.0, 0.1, ..., 0.9 is used.
    On ties, the first best threshold in iteration order wins.
    """
    if threshold_set is None:
        threshold_set = np.arange(0, 1, 0.1)

    best_threshold = None
    best_score = None

    for candidate in threshold_set:

        predictions = (proba >= candidate).astype(int)
        candidate_score = score_func(y_true, predictions)

        # Strict '>' keeps the earliest candidate on equal scores,
        # matching dict-insertion-order max() semantics.
        if (best_score is None) or (candidate_score > best_score):
            best_threshold = candidate
            best_score = candidate_score

    return best_threshold
+
+
def cross_validate_with_optimal_threshold(
        score_func_threshold: Callable,
        estimator: object,
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, np.ndarray, None] = None,
        scoring: Union[Callable, Dict] = None,
        cv: Union[Iterable, int, None] = None,
        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val: Union[pd.Series, np.ndarray, None] = None,
        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
        cv_threshold: Union[Iterable, int, None] = None,
        threshold_set: Union[Iterable, None] = None,
        scores: Dict = None)-> Dict:
    """
    Cross-validate a binary classifier while also tuning the
    probability threshold that maximizes score_func_threshold.

    Two modes:

    * cv is None: the test score is computed on (X_val, y_val); the
      threshold is tuned either on a separate validation set
      (X_val_threshold, y_val_threshold) or over cv_threshold folds.
    * cv given: the function recurses once per outer fold, each fold's
      validation part playing the role of (X_val, y_val).

    :param score_func_threshold: metric optimized by the threshold.
    :param estimator: classifier implementing fit/predict_proba.
    :param scoring: optional dict {name: callable} of extra metrics.
    :param threshold_set: candidate thresholds
        (NOTE(review): accepted but never forwarded to
        get_optimal_proba_threshold — confirm whether intended).
    :param scores: accumulator dict used by recursive calls.
    :return: dict of lists keyed "test_threshold",
        "train_score_threshold", "test_score_threshold" and
        "train_<metric>"/"test_<metric>" per extra metric, one entry
        per (outer) fold.
    """
    logger = Log("cross_validate_with_optimal_threshold:")

    # Deep copies: the concatenations below must not mutate the
    # caller's data.
    X_train = deepcopy(X)
    y_train = deepcopy(y)
    X_val = deepcopy(X_val)
    y_val = deepcopy(y_val)
    X_val_threshold = deepcopy(X_val_threshold)
    y_val_threshold = deepcopy(y_val_threshold)

    scores = scores or {"test_threshold": [],
                        "test_score_threshold": [],
                        "train_score_threshold": []}

    scoring = scoring or {}

    # Register one train/test list per additional metric.
    for metric_name, metric in scoring.items():
        if "test_" + metric_name not in scores:
            scores["test_" + metric_name] = []
            scores["train_" + metric_name] = []

    if cv is None:

        # test score is calculated on X_vals

        assert((X_val is not None) and (y_val is not None)),\
            "Validation set must be set"

        if cv_threshold is None:

            refit = (X_val_threshold is not None)

            # if a validation set for proba threshold tuning is not given,
            # we use the validation set on which we calculate the test score
            # (this might lead to overfitting)

            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)

            # Build a one-fold cv whose single split separates the
            # train part from the threshold-validation part.
            cv_threshold, X_train, y_train =\
                CVComposer().dummy_cv_and_concatenated_data_set(
                    X_train=X_train,
                    X_test=X_val_threshold,
                    y_train=y_train,
                    y_test=y_val_threshold)
        else:

            # if cv_threshold is given, we find the optimal threshold
            # on each fold and output the average value for the threshold

            if (X_val_threshold is not None):
                logger.log_and_throw_warning((
                        "X_val_threshold is set "
                        "but cv_threshold will be used"))

            if isinstance(cv_threshold, int):
                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
                    .split(X=X_train, y=y_train)

            refit = True

        thresholds = []

        # Tune the threshold on every threshold-cv fold.
        for train_inds, val_inds in cv_threshold:

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                CVComposer().cv_slice_dataset(
                    X=X_train,
                    y=y_train,
                    train_inds=train_inds,
                    test_inds=val_inds)

            estimator.fit(X_train_fold, y_train_fold)

            proba_val = estimator.predict_proba(X_val_fold)[:, 1]

            threshold = get_optimal_proba_threshold(
                score_func=score_func_threshold,
                y_true=y_val_fold,
                proba=proba_val)

            thresholds.append(threshold)

        scores["test_threshold"].append(np.mean(thresholds))

        if refit:

            # Refit on the full (possibly concatenated) train set
            # before computing the final probabilities.
            estimator.fit(X_train, y_train)

            proba_val = estimator.predict_proba(X_val)[:, 1]

        proba_train = estimator.predict_proba(X_train)[:, 1]

        # NOTE(review): predictions below use `threshold` from the
        # LAST fold, while "test_threshold" records the mean over
        # folds — confirm this asymmetry is intended.
        pred_train = (proba_train >= threshold)
        pred_val = (proba_val >= threshold)

        train_score = score_func_threshold(y_train, pred_train)
        test_score = score_func_threshold(y_val, pred_val)

        for metric_name, metric in scoring.items():
            scores["train_" + metric_name].append(metric(y_train, pred_train))
            scores["test_" + metric_name].append(metric(y_val, pred_val))

        scores["train_score_threshold"].append(train_score)
        scores["test_score_threshold"].append(test_score)

        return scores

    else:

        if isinstance(cv, int):
            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)

        cv_threshold = cv_threshold or []

        # One recursive call per outer fold; zip_longest pads missing
        # per-fold threshold-cv entries with None.
        for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                CVComposer().cv_slice_dataset(
                    X=X_train,
                    y=y_train,
                    train_inds=train_inds,
                    test_inds=val_inds)

            scores = cross_validate_with_optimal_threshold(
                    estimator=estimator,
                    score_func_threshold=score_func_threshold,
                    X=X_train_fold,
                    y=y_train_fold,
                    X_val=X_val_fold,
                    y_val=y_val_fold,
                    cv_threshold=cv_fold,
                    scoring=scoring,
                    threshold_set=threshold_set,
                    scores=scores)

        return scores
+
+
if __name__ == "__main__":

    # Smoke test: exercise every supported combination of cv,
    # cv_threshold and X_val_threshold arguments on the breast-cancer
    # data set, printing the resulting score dicts.

    from sklearn.metrics import accuracy_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from xgboost import XGBRFClassifier
    from sklearn.model_selection import train_test_split

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    X_train, X_val, y_train, y_val = train_test_split(X, y)

    estimator = XGBRFClassifier(use_label_encoder=False,
                                eval_metric="logloss")

    score_func = accuracy_score

    scoring = {"precision": precision_score}

    averaged_scores = []
    averaged_thresholds = []

    # Scenario 1: no cv at all — the threshold is tuned on the same
    # validation set that yields the test score.
    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")

    scores = cross_validate_with_optimal_threshold(
            score_func_threshold=accuracy_score,
            estimator=estimator,
            X=X_train,
            y=y_train,
            scoring=scoring,
            cv=None,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=None,
            y_val_threshold=None,
            cv_threshold=None)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # Carve a dedicated threshold-validation set out of the train set
    # for the remaining scenarios (note: X_train/y_train shrink here).
    X_train, X_val_threshold, y_train, y_val_threshold =\
        train_test_split(X_train, y_train)

    # Scenario 2: separate validation set for threshold tuning.
    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")

    scores = cross_validate_with_optimal_threshold(
            score_func_threshold=accuracy_score,
            estimator=estimator,
            X=X_train,
            y=y_train,
            scoring=scoring,
            cv=None,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv_threshold=None)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # Scenario 3: threshold tuned over 3 folds
    # (should log a warning since X_val_threshold is also given).
    print("\nTesting cv=None, cv_threshold=3 \n")

    scores = cross_validate_with_optimal_threshold(
            score_func_threshold=accuracy_score,
            estimator=estimator,
            X=X_train,
            y=y_train,
            scoring=scoring,
            cv=None,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv_threshold=3)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # Scenario 4: outer 3-fold cv, no inner threshold cv.
    print("\nTesting cv=3, cv_threshold=None \n")

    scores = cross_validate_with_optimal_threshold(
            score_func_threshold=accuracy_score,
            estimator=estimator,
            X=X_train,
            y=y_train,
            scoring=scoring,
            cv=3,
            X_val=None,
            y_val=None,
            X_val_threshold=None,
            y_val_threshold=None,
            cv_threshold=None)

    print("\nScores:", scores)

    print("\n ########################################################## \n")

    # Scenario 5: outer 3-fold cv with an inner 3-fold threshold cv
    # per outer fold.
    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")

    scores = cross_validate_with_optimal_threshold(
            score_func_threshold=accuracy_score,
            estimator=estimator,
            X=X_train,
            y=y_train,
            scoring=scoring,
            cv=3,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv_threshold=[3, 3, 3])

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score_threshold"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # TODO: check overwriting X_train,
    # additional metrics append instead of overwrite
    # check the length of cv_threshold
    # test custom cv, cv_threshold

    print("\n Averaged test score:", averaged_scores)
    print("\n Averaged threshold:", averaged_thresholds)

+ 824 - 0
cdplib/pipeline_selector/PipelineSelector.py

@@ -0,0 +1,824 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:23:23 2020
+
+@author: tanya
+@description: an abstract class for selecting a machine learning
+ pipeline from a space (deterministic or random) of parameter distributions
+ over multiple pipelines.
+ The selection is thought in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far
+ as well as the entire tuning history if needed.
+ Methods configure_cross_validation and configure_result_saving
+ allow to use a custom cross-validation method and
+ save the current best result in a file or database during training.
+ Children classes: hyperopt and custom gridsearch.
+"""
+
+import pickle
+import os
+import sys
+import time
+import datetime
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+from abc import ABC, abstractmethod, abstractproperty
+
+if sys.version_info >= (3, 8):
+    from typing import Callable, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
+else:
+    from typing_extensions import Callable, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
+
+import functools
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
+from sklearn.metrics import make_scorer
+from hyperopt import STATUS_OK, STATUS_FAIL
+from cdplib.log import Log
+from cdplib.utils import ExceptionsHandler
+from cdplib.utils import LoadingUtils
+from cdplib.ml_validation import CVComposer
+
+sys.path.append(os.getcwd())
+
+
class SpaceElementType(TypedDict):
    """
    Shape of one element of the search space:
    a named pipeline together with the hyperparameter
    values to try on it (consumed by PipelineSelector._objective).
    """
    # human-readable identifier of the pipeline variant
    name: str
    # sklearn Pipeline object to be tuned
    pipeline: Pipeline
    # hyperparameter values applied via pipeline.set_params(**params)
    params: dict
+    
+# TODO Tanya: add possibility to include confusion matrix in
+# additional metrics
+# check that cv object contains indices
+
+class PipelineSelector(ABC):
+    """
+    An abstract class for selecting a machine learning
+    pipeline from a space (deterministic or random) of parameter
+    distributions over multiple pipelines.
+    The selection is though in such a way that a Trials object is being
+    maintained during the tuning process from which one can retrieve
+    the best pipeline so far as well as the entire tuning history
+    if needed.
+    Methods configure_cross_validation and configure_result_saving
+    allow to use a custom cross-validation method and
+    save the current best result in a file or database during training.
+    Children classes: hyperopt and custom gridsearch.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = None,
+                 cross_validation_needs_scorer: bool = True,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Dict[str, Callable] = None,
+                 additional_averaging_funcs: Dict[str, Callable] = None,
+                 strategy_name: str = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequecy in interations (trials)
+            of saving the trials object at the trials_path.
+            if None, the trials object is backed up avery time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores of the cost_func.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metics: dict of additional metrics to keep track of
+            in the trials of the form {"metric_name": metric}.
+
+        :param additional_averaging_funcs: functions used to aggregate
+            the output of the cross_validate function.
+            The output always contains the scores of the cost_func,
+            additional_metrics (if it is not empty),
+            but it can also contain additional information
+            (like probability threshold for example)
+            if different from cross_val_averaging_func.
+            Of the form {"metric_name": averaging_func}
+
+            Remark:
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
+
+        try:
+
+            ExceptionsHandler(self._logger)\
+                .assert_is_directory(path=trials_path)
+
+            self.attached_space = False
+            self.attached_data = False
+            self.configured_cross_validation = False
+            self.configured_summary_saving = False
+
+            self._cost_func = cost_func
+            self._greater_is_better = greater_is_better
+            # score factor is 1 when cost_func is minimized,
+            # -1 when cost func is maximized
+            self._score_factor = (not greater_is_better) - greater_is_better
+            self._cross_val_averaging_func = cross_val_averaging_func
+            self._additional_metrics = additional_metrics
+            self._additional_averaging_funcs = additional_averaging_funcs or {}
+            
+            self.trials_path = trials_path
+            self._backup_trials_freq = backup_trials_freq
+
+            self._strategy_name = strategy_name
+            self._data_path = None
+            self._cv_path = None
+            self._X = None
+            self._y = None
+            self._cv = None
+            self._space = None
+
+            # if cross-valition is not configured,
+            # sklearn cross-validation method is taken by default
+            self._cross_validation = sklearn_cross_validation
+            
+            self._cross_validation_needs_scorer = cross_validation_needs_scorer
+
+            # if a trials object already exists at the given path,
+            # it is loaded and the search is continued. Else,
+            # the search is started from the beginning.
+            if os.path.isfile(self.trials_path):
+
+                with open(self.trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+                    
+                if len(self._trials) == 0:
+                    self._trials = None
+                    
+            else:
+                self._trials = None
+                
+            if self._trials is not None:
+
+                self._start_iteration = self.number_of_trials
+
+                self.best_score = self.best_trial_score
+
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(self._start_iteration))
+
+            else:
+                self._logger.warning(("No existing trials object was found, "
+                                      "Starting from scratch."))
+
+                self._trials = None
+                self._start_iteration = 0
+                self.best_score = np.nan
+
+            # keeping track of the current search iteration
+            self._iteration = self._start_iteration
+            self._score_improved = False
+
+            self.start_tuning_time = datetime.datetime.today()
+            self.total_tuning_time = None
+            self.finished_tuning = False
+            
+        except Exception as e:
+            err = ("Failed to initialize the class. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _backup_trials(self) -> None:
+        '''
+        Pickles (Saves) the trials object in binary format.
+        '''
+        try:
+            with open(self.trials_path, "wb") as f:
+                pickle.dump(self._trials, f)
+
+        except Exception as e:
+            err = "Could not backup trials. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation(self,
+                                   cross_validation: Callable,
+                                   kwargs: dict = None) -> None:
+        """
+        Method for attaching a custom cross-validation function
+
+        :param cross_validation: a function that has the same
+             signature as sklearn.model_selection.cross_validate
+        """
+        try:
+            kwargs = kwargs or {}
+
+            self._cross_validation = functools.partial(
+                    cross_validation, **kwargs)
+
+            self.configured_cross_validation = True
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to configure cross-validation. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def configure_cross_validation_from_module(self,
+                                               module_path: str,
+                                               name: str) -> None:
+        """
+        Attaches a cross-validation funciton defined in
+        a different python model. This function must have
+        the same signature as sklearn.model_seclection.cross_validate
+
+        :param str module_path: path to python module
+            where the cross_validation function is defined.
+
+        :param str name: name of the cross validation function
+            loaded froma python module.
+        """
+        try:
+            self._cross_validation = \
+                LoadingUtils().load_from_module(
+                        module_path=module_path, name=name)
+
+            self.configured_cross_validation = True
+
+            self._logger.info("Configured cross validation")
+
+        except Exception as e:
+            err = ("Failed to load cross-validation from module. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_space(self, space) -> None:
+        """
+        Method for attaching the pipeline/hyperparameter space
+        over which the score_func is optimized.
+
+        :param space: space where
+            the search is performed. A space might be either
+            a list of dictionaries or a hyperopt space object
+            the elements of which are dictionaries with keys:
+            name, pipeline, params
+        """
+        try:
+            self._space = space
+
+            self.attached_space = True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            err = ("Failed to attach space. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_space_from_module(self, module_path: str, name: str) -> None:
+        """
+        Attaches a space defined in a different python module.
+
+        :param str module_path: path to python module
+            where the space is defined.
+
+        :param str name: name of the space loaded from
+            a python module.
+        """
+        try:
+            self._space = LoadingUtils().load_from_module(
+                    module_path=module_path, name=name)
+
+            self.attached_space = True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            err = ("Failed to attach space from module. "
+                   "Exit with error {}".format(e))
+
+            self._logger.loger_and_raise_error(err)
+
+    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
+                    y_train: Union[pd.DataFrame, pd.Series, np.ndarray]
+                    = None,
+                    X_val: Union[pd.DataFrame, np.ndarray]
+                    = None,
+                    y_val: Union[pd.DataFrame, pd.Series, np.ndarray]
+                    = None,
+                    cv: Union[Iterable[Tuple[List[int], List[int]]]]
+                    = None) -> None:
+        '''
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (None in case of unsupervided learning)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: iterabe of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+        try:
+            assert((cv is None) == (X_val is not None)),\
+                "Either cv or X_val must be provided"
+
+            if cv is None:
+
+                assert((y_val is None) == (y_train is None)),\
+                    "y_train and y_val must be simultanious"
+
+                # Here we create a trivial cv object
+                # with one validation split.
+                
+                # XXX Tanya finish here
+                
+                cv = CVComposer.dummy_cv()
+
+                train_inds = list(range(len(X_train)))
+                val_inds = list(range(len(X_train),
+                                      len(X_train) + len(X_val)))
+
+                self._cv = [(train_inds, val_inds)]
+
+                self._X = np.concatenate([X_train, X_val])
+                self._y = None if y_train is None\
+                    else np.concatenate([y_train, y_val])
+
+            else:
+
+                self._cv = cv
+                self._X = X_train
+                self._y = y_train
+
+            self.attached_data = True
+
+            self._logger.info("Attached data")
+
+        except Exception as e:
+            err = ("Failed to attach data. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
    def attach_data_from_hdf5(self,
                              data_hdf5_store_path: str,
                              cv_pickle_path: str = None) -> None:
        """
        Attach data from a hdf5 store and, optionally, a cv object
        from a pickled file; delegates to self.attach_data.

        The hdf5 store is a binary file; it is read as a mapping
        with (a subset of) the keys X_train, y_train, X_val, y_val —
        missing keys are passed to attach_data as None.

        The cv object is loaded from a separate pickle file.

        The reason to separate the data store from the cv store is
        that hdf5 is optimized to store large dataframes
        (especially with simple types), while a small list of lists
        like a cv-object is better stored as a pickle file.

        :param str data_hdf5_store_path: path to the hdf5 store
            with train and validation data
        :param str cv_pickle_path: path to the pickle file with
            the cv data
        """
        try:
            assert(os.path.isfile(data_hdf5_store_path)),\
                "Parameter hdf5_store_path is not a file"
                
            # close all opened files, because hdf5 will 
            # fail to reopen an opened (for some reason) file
            import tables
            tables.file._open_files.close_all()

            store = pd.HDFStore(data_hdf5_store_path)

            # remember the source path for default_summary reporting
            self._data_path = data_hdf5_store_path

            data_input = {key: store[key] if key in store else None
                          for key in ["X_train", "y_train", "X_val", "y_val"]}

            if cv_pickle_path is not None:

                assert(os.path.isfile(cv_pickle_path)),\
                    "Parameter cv_pickle_path is not a file"

                # NOTE(review): the pickle file handle is never closed
                # explicitly — consider a `with` block; verify intent
                data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))

                self._cv_path = cv_pickle_path

            else:
                data_input["cv"] = None

            self.attach_data(**data_input)

            # NOTE(review): store is not closed if attach_data raises;
            # a try/finally would be safer — confirm with the author
            store.close()

        except Exception as e:
            err = "Failed to attach data. Exit with error: {}".format(e)
            self._logger.log_and_raise_error(err)
+
+    @property
+    def default_summary(self) -> dict:
+        """
+        Default summary of the strategy.
+        Every the _objective function is called
+        the current score and the information
+        about the tested space element is added to the
+        summary and it is saved to the Trials.
+        If summary saving is configured it is also
+        saved to a file, or a database when the score improves.
+        """
+        summary = {}
+
+        if self._strategy_name is not None:
+            summary["strategy_name"] = self._strategy_name
+
+        if isinstance(self._cost_func, str):
+            summary["cost_func"] = self._cost_func
+
+        elif hasattr(self._cost_func, "__name__"):
+            summary["cost_func"] = self._cost_func.__name__
+
+        summary["trials_path"] = self.trials_path
+
+        if self._data_path is not None:
+            summary["data_path"] = self._data_path
+
+        if self._cv_path is not None:
+            summary["cv_path"] = self._cv_path
+
+        summary["start_tuning_time"] = self.start_tuning_time
+
+        summary["iteration"] = self._iteration
+
+        return summary
+
+    def configer_summary_saving(self,
+                                save_method: Callable
+                                = functools.partial(
+                                        pd.DataFrame.to_excel,
+                                        **{"path_or_buf": "result.csv"}),
+                                kwargs: dict = None) -> None:
+        """
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
+
+        :param Callable save_method: method for saving the result
+            of the pipeline selection. The method must accept
+            a pandas DataFrame as argument.
+            By default, saving to an excel file.
+
+            Examples:
+                functools.partial(pd.DataFrame.to_csv,
+                                  **{"path_or_buf": <PATH>})
+                functools.partial(np.savetxt, **{"fname": <PATH>})
+
+                functools.partial(SQLHandler(<URI>).append_to_table,
+                                  **{"tablename": <NAME>})
+
+                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
+                                  **{"collection_name": <NAME>})
+
+            using functools can be avoided by providing the kwarg argument
+
+        :param dict kwargs: a dictionary with keyword arguments
+            (like tablename) to provide to the save_method
+        """
+        try:
+            kwargs = kwargs or {}
+
+            self._save_method = functools.partial(save_method, **kwargs)
+
+            self.configured_summary_saving = True
+
+            self._logger.info("Configured summary saving")
+
+        except Exception as e:
+            err = ("Failed to configure the summary saving. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _save_summary(self, summary: dict) -> None:
+        """
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
+        """
+        try:
+            assert(self.configured_summary_saving),\
+                "Result saving must be configured first"
+
+            self._save_method(summary)
+
+        except Exception as e:
+            err = ("Could not configure summary saving. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
+        """
+        Calculates the averaged cross-validated score and score variance,
+        as well as the averaged values and variances of the additional metrics.
+
+        This method is called in the _objective function that is
+        passed to the hyperopt optimizer.
+
+        This function can be overriden, when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :return: dictionary with the aggregated
+            cross-validation scores and
+            the score variances for the scores in the output
+            of the cross-validation function.
+
+            form of the output:
+                {"score": 10, #score used in optimization,
+                 "score_variance": 0.5
+                 "additional_metric1": 5,
+                 "additional_metric1_variance": 7}
+
+            a custom cross-validation function can also include for
+            example probability threshold for each fold, then
+            the output of this function will include the average
+            value and the variance of the probability threshold
+            over the folds.
+        """
+        try:
+            
+            scoring = {"score": self._cost_func} | self._additional_metrics
+            
+            if self._cross_validation_needs_scorer:
+                for metric_name, metric in scoring.itmes():
+                    scoring[metric_name] = make_scorer(
+                        metric, greater_is_better=self._greater_is_better)
+                    
+            cross_validation_input_args = {
+                 "estimator": pipeline,
+                 "X": self._X,
+                 "y": self._y,
+                 "cv": self._cv,
+                 "scoring": scoring
+                 }
+            
+            if "error_score" in self._cross_validation.__annotations__:
+                cross_validation_input_args["error_score"] = np.nan
+
+            scores = self._cross_validation(**cross_validation_input_args)
+
+            averaging_funcs = {
+                    metric_name: self._additional_averaging_funcs[metric_name]
+                    if metric_name in self._additional_averaging_funcs
+                    else self._cross_val_averaging_func
+                    for metric_name in scores}
+
+            scores_average = {
+                    metric_name.replace("test_", ""):
+                    averaging_funcs[metric_name](scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            scores_variance = {
+                    metric_name.replace("test_", "") + "_variance":
+                    np.var(scores[metric_name])
+                    for metric_name in scores
+                    if metric_name.startswith("test")}
+
+            return {**scores_average, **scores_variance}
+
+        except Exception as e:
+            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    def _objective(self, space_element: SpaceElementType) -> dict:
+        '''
+        This method is called in run_trials method
+        that is using the hyperopt fmin opmizer.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it is multiplied by -1 to obtain loss.
+
+        :param SpaceElementType space_element: element
+            of the space over which the optimization is done
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            uderstood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+            and other keys given in self.default_summary
+        '''
+        try:
+            start_time = time.time()
+
+            assert(self.attached_data),\
+                ("Data must be attached in order "
+                 "in order to effectuate the best"
+                 "pipeline search")
+
+            summary = deepcopy(self.default_summary)
+
+            # backup the current trials if the score improved
+            # at previous iteration or every ith iteration
+            # if the backup_trials_freq is set
+            backup_cond = ((self._backup_trials_freq is not None) and
+                           ((self._iteration - self._start_iteration - 1) %
+                            self._backup_trials_freq == 0)) or\
+                self._score_improved
+
+            if backup_cond:
+                self._backup_trials()
+                self._score_improved = False
+
+            pipeline = space_element['pipeline']
+            params = space_element['params']
+            pipeline.set_params(**params)
+
+            self._logger.info(("Iteration {0}: "
+                               "Current score is {1}: "
+                               "Training pipeline {2} "
+                               "with parameters: {3}. ").format(
+                                  self._iteration,
+                                  self.best_score,
+                                  space_element['name'],
+                                  params))
+
+            result = self._evaluate(pipeline)
+
+            summary.update(result)
+
+            end_time = time.time()
+
+            summary['status'] = STATUS_OK
+            summary.update(result)
+            summary['loss'] = self._score_factor * summary['score']
+            summary['timestamp'] = datetime.datetime.today()
+            summary['train_time'] = end_time - start_time
+
+            self._iteration += 1
+
+            self._score_improved = (self.best_score != self.best_score) or\
+                                   (self._score_factor*result["score"] <
+                                    self._score_factor*self.best_score)
+
+            if self._score_improved:
+
+                self._logger.info("Score improved, new best score is: {}"
+                                  .format(result["score"]))
+
+                self.best_score = result['score']
+
+                if self.configured_summary_saving:
+                    self._save_summary(summary)
+
+        except Exception as e:
+
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            summary = {}
+            summary['status'] = STATUS_FAIL
+            summary['timestamp'] = datetime.datetime.today()
+            summary['error'] = e
+            for key in ['loss', 'score', 'score_variance', 'train_time']:
+                summary[key] = np.nan
+
+        return summary
+
+    @abstractmethod
+    def run_trials(self):
+        """
+        Method that runs the hyperparameter tuning over possibly multiple
+        pipeline types specified in self.space
+        When run_trials method is finished the flag self.finished_tuning
+        should be set to True and the methods self._backup_trials and
+        optionally self._save_result should be called.
+        """
+        pass
+
+    @abstractproperty
+    def number_of_trials(self) -> int:
+        """
+        Number of trials already run in the current trials object
+        """
+        pass
+
+    @abstractproperty
+    def best_trial(self) -> dict:
+        """
+        Best trial sor far.
+         Should contain the status, pipeline,
+         hyperparameters, and the score (loss).
+         Other information is otional and is defined
+         by self.default_summary
+        """
+        pass
+
+    @abstractproperty
+    def best_trial_score(self) -> float:
+        """
+        Score of the best pipeline with the best hyperparameters
+        """
+        pass
+
+    @abstractproperty
+    def best_trial_score_variance(self) -> float:
+        """
+        Variance of the cross-validation score of the best pipeline
+        """
+        pass
+
+    @abstractproperty
+    def best_trial_pipeline(self) -> Pipeline:
+        """
+        Best pipeline with best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines(self, n: int) -> list:
+        """
+        N best pipelines with corresponding
+        best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
+        """
+        If the hyperparameter search is done over multiple
+        pipelines, then returns n different pipeline-types
+        with corresponding hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def trials_to_excel(self, path: str) -> None:
+        """
+        Trials object in the shape of table written to excel,
+        should contain the iteration, pipeline (as str),
+        hyperparamters (as str), self.best_result (see self._objective method)
+        as well as additional information defined by self.default_summary
+        """
+        pass

+ 1 - 0
cdplib/pipeline_selector/__init__.py

@@ -0,0 +1 @@
+from .PipelineSelector import *

+ 28 - 18
cdplib/utils/ExceptionsHandler.py

@@ -8,35 +8,45 @@ Created on Fri Sep 27 14:20:58 2019
 
 import os
 import sys
-import logging
 import pandas as pd
+from cdplib.log import Log
+
 sys.path.append(os.getcwd())
 
 
 class ExceptionsHandler:
     '''
     '''
-    def __init__(self):
+    def __init__(self, logger: Log = None):
         '''
         '''
+        self._logger = logger or Log("ExceptionHandler")
 
-    def check_is_file(self, path, logger=None):
+    def check_is_file(self, path: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
-
         if not os.path.isfile(path):
             err = "File {} not found".format(path)
-            logger.error(err)
+            self._logger.error(err)
             raise FileNotFoundError(err)
 
-    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
-                               error_or_warning: str, logger = None):
+    def assert_is_directory(self, path: str):
+        ""
+        ""
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must of str type"
+
+        dirname = os.path.dirname("path")
+
+        if len(dirname) > 0:
+            os.mkdir(dirname, exists_ok=True)
+
+    def _check_column_abscence(self,
+                               columns: (str, list),
+                               data: pd.DataFrame,
+                               error_or_warning: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
         if isinstance(columns, str):
             columns = [columns]
 
@@ -44,23 +54,23 @@ class ExceptionsHandler:
 
             if column not in data.columns:
                 err = ("{} is not an internal column name".format(column))
-                getattr(logger, error_or_warning)(err)
+                getattr(self._logger, error_or_warning)(err)
 
                 if error_or_warning == "error":
                     raise Exception(err)
 
-    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def error_column_abscence(self,
+                              columns: (str, list),
+                              data: pd.DataFrame):
         '''
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="error",
-                                           logger=logger)
+                                           error_or_warning="error")
 
-    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
         '''
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="warning",
-                                           logger=logger)
+                                           error_or_warning="warning")

+ 46 - 0
cdplib/utils/LoadingUtils.py

@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct  1 12:58:58 2020
+
+@author: tanya
+@description: class for methods of loading data from external sources
+"""
+
+import os
+import sys
+from cdplib.log import Log
+
+
+class LoadingUtils:
+    """
+    """
+    def __init__(self, logger=None):
+        """
+        """
+        self._logger = logger or Log("LoadingUtils")
+
+    def load_from_module(self, module_path: str, name: str):
+        """
+        """
+        for param_name, param_value in [("module_path", module_path),
+                                        ("name", name)]:
+            assert(isinstance(param_value, str)),\
+                "Parameter '{}' must be of str type".format(param_name)
+
+        assert(os.path.isfile(module_path)),\
+            "Parameter 'module_path' must be a valid file"
+
+        module, extension = os.path.splitext(os.path.basename(module_path))
+
+        assert(extension == ".py"),\
+            "Parameter 'space' must be read from a python file"
+
+        sys.path.insert(0, os.path.dirname(module_path))
+
+        try:
+            import importlib
+            return getattr(importlib.import_module(module), name)
+
+        except (ImportError, AttributeError):
+            err = "Invalid space location or name"
+            self._logger.log_and_raise_error(err)

+ 36 - 0
cdplib/utils/TypeConverter.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 24 09:06:13 2020
+
+@author: tanya
+"""
+
+import numpy as np
+import pandas as pd
+
+class TypeConverter:
+    """
+    Library for methods to manage python types
+    """
+    def __init__(self):
+        """
+        """
+        from cdplib.log import Log
+
+        self._logger = Log("TypeConverter")
+
+    def convert_to_ndarray(self, x: (pd.DataFrame, np.ndarray)) -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            self._logger.log_and_raise_error_stack_info(
+                    'The argument must be a numpy array or a pandas DataFrame')