Преглед изворни кода

Merge branch 'master' of https://intra.acdp.at/gogs/tanja/cdplib

ogert пре 3 година
родитељ
комит
60abdd6dc6

+ 3 - 3
Pipfile

@@ -9,12 +9,12 @@ verify_ssl = true
 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
 pandas = "!=0.24.0"
 sqlalchemy = "*"
-sqlparse = "*"
-pymysql = "*"
+# sqlparse = "*"
+# pymysql = "*"
 pymongo = "*"
 jsonref = "*"
 simplejson = "*"
-mysql = "*"
+# mysql = "*"
 hyperopt = "*"
 influxdb = "*"
 

+ 112 - 146
Pipfile.lock

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "1879ebbd4ee3fe44d9e59091889a69ead4c7b76e81b70de0dd74d12b5266cf42"
+            "sha256": "194ac762a9255b24372b56ca5cd3def753b14c78784407b69cba906417bc53b2"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -26,7 +26,7 @@
         "cdplib": {
             "editable": true,
             "git": "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git",
-            "ref": "623f7488557e373eb3181bb4099295ed17a53b5c"
+            "ref": "362220280f6768abda363240b5fce51eb4d9016e"
         },
         "certifi": {
             "hashes": [
@@ -130,14 +130,6 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.10"
         },
-        "importlib-metadata": {
-            "hashes": [
-                "sha256:c9db46394197244adf2f0b08ec5bc3cf16757e9590b02af1fca085c16c0d600a",
-                "sha256:d2d46ef77ffc85cbf7dac7e81dd663fde71c45326131bea8033b9bad42268ebe"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.10.0"
-        },
         "influxdb": {
             "hashes": [
                 "sha256:46f85e7b04ee4b3dee894672be6a295c94709003a7ddea8820deec2ac4d8b27a",
@@ -199,7 +191,6 @@
             "hashes": [
                 "sha256:55e66b5e7b3823b1da5fb2a063e95a628fb850b2a0b76bdcd884faac5d2daa7d"
             ],
-            "index": "pypi",
             "version": "==0.0.2"
         },
         "mysqlclient": {
@@ -253,33 +244,25 @@
         },
         "pandas": {
             "hashes": [
-                "sha256:0a643bae4283a37732ddfcecab3f62dd082996021b980f580903f4e8e01b3c5b",
-                "sha256:0de3ddb414d30798cbf56e642d82cac30a80223ad6fe484d66c0ce01a84d6f2f",
-                "sha256:19a2148a1d02791352e9fa637899a78e371a3516ac6da5c4edc718f60cbae648",
-                "sha256:21b5a2b033380adbdd36b3116faaf9a4663e375325831dac1b519a44f9e439bb",
-                "sha256:24c7f8d4aee71bfa6401faeba367dd654f696a77151a8a28bc2013f7ced4af98",
-                "sha256:26fa92d3ac743a149a31b21d6f4337b0594b6302ea5575b37af9ca9611e8981a",
-                "sha256:2860a97cbb25444ffc0088b457da0a79dc79f9c601238a3e0644312fcc14bf11",
-                "sha256:2b1c6cd28a0dfda75c7b5957363333f01d370936e4c6276b7b8e696dd500582a",
-                "sha256:2c2f7c670ea4e60318e4b7e474d56447cf0c7d83b3c2a5405a0dbb2600b9c48e",
-                "sha256:3be7a7a0ca71a2640e81d9276f526bca63505850add10206d0da2e8a0a325dae",
-                "sha256:4c62e94d5d49db116bef1bd5c2486723a292d79409fc9abd51adf9e05329101d",
-                "sha256:5008374ebb990dad9ed48b0f5d0038124c73748f5384cc8c46904dace27082d9",
-                "sha256:5447ea7af4005b0daf695a316a423b96374c9c73ffbd4533209c5ddc369e644b",
-                "sha256:573fba5b05bf2c69271a32e52399c8de599e4a15ab7cec47d3b9c904125ab788",
-                "sha256:5a780260afc88268a9d3ac3511d8f494fdcf637eece62fb9eb656a63d53eb7ca",
-                "sha256:70865f96bb38fec46f7ebd66d4b5cfd0aa6b842073f298d621385ae3898d28b5",
-                "sha256:731568be71fba1e13cae212c362f3d2ca8932e83cb1b85e3f1b4dd77d019254a",
-                "sha256:b61080750d19a0122469ab59b087380721d6b72a4e7d962e4d7e63e0c4504814",
-                "sha256:bf23a3b54d128b50f4f9d4675b3c1857a688cc6731a32f931837d72effb2698d",
-                "sha256:c16d59c15d946111d2716856dd5479221c9e4f2f5c7bc2d617f39d870031e086",
-                "sha256:c61c043aafb69329d0f961b19faa30b1dab709dd34c9388143fc55680059e55a",
-                "sha256:c94ff2780a1fd89f190390130d6d36173ca59fcfb3fe0ff596f9a56518191ccb",
-                "sha256:edda9bacc3843dfbeebaf7a701763e68e741b08fccb889c003b0a52f0ee95782",
-                "sha256:f10fc41ee3c75a474d3bdf68d396f10782d013d7f67db99c0efbfd0acb99701b"
+                "sha256:167693a80abc8eb28051fbd184c1b7afd13ce2c727a5af47b048f1ea3afefff4",
+                "sha256:2111c25e69fa9365ba80bbf4f959400054b2771ac5d041ed19415a8b488dc70a",
+                "sha256:298f0553fd3ba8e002c4070a723a59cdb28eda579f3e243bc2ee397773f5398b",
+                "sha256:2b063d41803b6a19703b845609c0b700913593de067b552a8b24dd8eeb8c9895",
+                "sha256:2cb7e8f4f152f27dc93f30b5c7a98f6c748601ea65da359af734dd0cf3fa733f",
+                "sha256:52d2472acbb8a56819a87aafdb8b5b6d2b3386e15c95bde56b281882529a7ded",
+                "sha256:612add929bf3ba9d27b436cc8853f5acc337242d6b584203f207e364bb46cb12",
+                "sha256:649ecab692fade3cbfcf967ff936496b0cfba0af00a55dfaacd82bdda5cb2279",
+                "sha256:68d7baa80c74aaacbed597265ca2308f017859123231542ff8a5266d489e1858",
+                "sha256:8d4c74177c26aadcfb4fd1de6c1c43c2bf822b3e0fc7a9b409eeaf84b3e92aaa",
+                "sha256:971e2a414fce20cc5331fe791153513d076814d30a60cd7348466943e6e909e4",
+                "sha256:9db70ffa8b280bb4de83f9739d514cd0735825e79eef3a61d312420b9f16b758",
+                "sha256:b730add5267f873b3383c18cac4df2527ac4f0f0eed1c6cf37fcb437e25cf558",
+                "sha256:bd659c11a4578af740782288cac141a322057a2e36920016e0fc7b25c5a4b686",
+                "sha256:c601c6fdebc729df4438ec1f62275d6136a0dd14d332fc0e8ce3f7d2aadb4dd6",
+                "sha256:d0877407359811f7b853b548a614aacd7dea83b0c0c84620a9a643f180060950"
             ],
             "index": "pypi",
-            "version": "==1.1.5"
+            "version": "==1.2.4"
         },
         "pymongo": {
             "hashes": [
@@ -356,7 +339,7 @@
                 "sha256:41fc3a0c5013d5f039639442321185532e3e2c8924687abe6537de157d403641",
                 "sha256:816927a350f38d56072aeca5dfb10221fe1dc653745853d30a216637f5d7ad36"
             ],
-            "index": "pypi",
+            "markers": "python_version >= '3.6'",
             "version": "==1.0.2"
         },
         "python-dateutil": {
@@ -384,67 +367,67 @@
         },
         "scikit-learn": {
             "hashes": [
-                "sha256:0567a2d29ad08af98653300c623bd8477b448fe66ced7198bef4ed195925f082",
-                "sha256:087dfede39efb06ab30618f9ab55a0397f29c38d63cd0ab88d12b500b7d65fd7",
-                "sha256:1adf483e91007a87171d7ce58c34b058eb5dab01b5fee6052f15841778a8ecd8",
-                "sha256:259ec35201e82e2db1ae2496f229e63f46d7f1695ae68eef9350b00dc74ba52f",
-                "sha256:3c4f07f47c04e81b134424d53c3f5e16dfd7f494e44fd7584ba9ce9de2c5e6c1",
-                "sha256:4562dcf4793e61c5d0f89836d07bc37521c3a1889da8f651e2c326463c4bd697",
-                "sha256:4ddd2b6f7449a5d539ff754fa92d75da22de261fd8fdcfb3596799fadf255101",
-                "sha256:54be0a60a5a35005ad69c75902e0f5c9f699db4547ead427e97ef881c3242e6f",
-                "sha256:5580eba7345a4d3b097be2f067cc71a306c44bab19e8717a30361f279c929bea",
-                "sha256:7b04691eb2f41d2c68dbda8d1bd3cb4ef421bdc43aaa56aeb6c762224552dfb6",
-                "sha256:826b92bf45b8ad80444814e5f4ac032156dd481e48d7da33d611f8fe96d5f08b",
-                "sha256:83b21ff053b1ff1c018a2d24db6dd3ea339b1acfbaa4d9c881731f43748d8b3b",
-                "sha256:8772b99d683be8f67fcc04789032f1b949022a0e6880ee7b75a7ec97dbbb5d0b",
-                "sha256:895dbf2030aa7337649e36a83a007df3c9811396b4e2fa672a851160f36ce90c",
-                "sha256:8aa1b3ac46b80eaa552b637eeadbbce3be5931e4b5002b964698e33a1b589e1e",
-                "sha256:9599a3f3bf33f73fed0fe06d1dfa4e6081365a58c1c807acb07271be0dce9733",
-                "sha256:99349d77f54e11f962d608d94dfda08f0c9e5720d97132233ebdf35be2858b2d",
-                "sha256:9a24d1ccec2a34d4cd3f2a1f86409f3f5954cc23d4d2270ba0d03cf018aa4780",
-                "sha256:9bed8a1ef133c8e2f13966a542cb8125eac7f4b67dcd234197c827ba9c7dd3e0",
-                "sha256:9c6097b6a9b2bafc5e0f31f659e6ab5e131383209c30c9e978c5b8abdac5ed2a",
-                "sha256:9dfa564ef27e8e674aa1cc74378416d580ac4ede1136c13dd555a87996e13422",
-                "sha256:a0334a1802e64d656022c3bfab56a73fbd6bf4b1298343f3688af2151810bbdf",
-                "sha256:a29460499c1e62b7a830bb57ca42e615375a6ab1bcad053cd25b493588348ea8",
-                "sha256:a36e159a0521e13bbe15ca8c8d038b3a1dd4c7dad18d276d76992e03b92cf643",
-                "sha256:abe835a851610f87201819cb315f8d554e1a3e8128912783a31e87264ba5ffb7",
-                "sha256:c13ebac42236b1c46397162471ea1c46af68413000e28b9309f8c05722c65a09",
-                "sha256:c3deb3b19dd9806acf00cf0d400e84562c227723013c33abefbbc3cf906596e9",
-                "sha256:c658432d8a20e95398f6bb95ff9731ce9dfa343fdf21eea7ec6a7edfacd4b4d9",
-                "sha256:c7f4eb77504ac586d8ac1bde1b0c04b504487210f95297235311a0ab7edd7e38",
-                "sha256:d54dbaadeb1425b7d6a66bf44bee2bb2b899fe3e8850b8e94cfb9c904dcb46d0",
-                "sha256:ddb52d088889f5596bc4d1de981f2eca106b58243b6679e4782f3ba5096fd645",
-                "sha256:ed9d65594948678827f4ff0e7ae23344e2f2b4cabbca057ccaed3118fdc392ca",
-                "sha256:fab31f48282ebf54dd69f6663cd2d9800096bad1bb67bbc9c9ac84eb77b41972"
+                "sha256:038f4e9d6ef10e1f3fe82addc3a14735c299866eb10f2c77c090410904828312",
+                "sha256:06ffdcaaf81e2a3b1b50c3ac6842cfb13df2d8b737d61f64643ed61da7389cde",
+                "sha256:0e71ce9c7cbc20f6f8b860107ce15114da26e8675238b4b82b7e7cd37ca0c087",
+                "sha256:1eec963fe9ffc827442c2e9333227c4d49749a44e592f305398c1db5c1563393",
+                "sha256:2754c85b2287333f9719db7f23fb7e357f436deed512db3417a02bf6f2830aa5",
+                "sha256:2db429090b98045d71218a9ba913cc9b3fe78e0ba0b6b647d8748bc6d5a44080",
+                "sha256:39b7e3b71bcb1fe46397185d6c1a5db1c441e71c23c91a31e7ad8cc3f7305f9a",
+                "sha256:3cbd734e1aefc7c5080e6b6973fe062f97c26a1cdf1a991037ca196ce1c8f427",
+                "sha256:40556bea1ef26ef54bc678d00cf138a63069144a0b5f3a436eecd8f3468b903e",
+                "sha256:48f273836e19901ba2beecd919f7b352f09310ce67c762f6e53bc6b81cacf1f0",
+                "sha256:49ec0b1361da328da9bb7f1a162836028e72556356adeb53342f8fae6b450d47",
+                "sha256:4e6198675a6f9d333774671bd536668680eea78e2e81c0b19e57224f58d17f37",
+                "sha256:5beaeb091071625e83f5905192d8aecde65ba2f26f8b6719845bbf586f7a04a1",
+                "sha256:5ff3e4e4cf7592d36541edec434e09fb8ab9ba6b47608c4ffe30c9038d301897",
+                "sha256:62214d2954377fcf3f31ec867dd4e436df80121e7a32947a0b3244f58f45e455",
+                "sha256:7be1b88c23cfac46e06404582215a917017cd2edaa2e4d40abe6aaff5458f24b",
+                "sha256:8fac72b9688176922f9f54fda1ba5f7ffd28cbeb9aad282760186e8ceba9139a",
+                "sha256:90a297330f608adeb4d2e9786c6fda395d3150739deb3d42a86d9a4c2d15bc1d",
+                "sha256:a2a47449093dcf70babc930beba2ca0423cb7df2fa5fd76be5260703d67fa574",
+                "sha256:ae19ac105cf7ce8c205a46166992fdec88081d6e783ab6e38ecfbe45729f3c39",
+                "sha256:ae426e3a52842c6b6d77d00f906b6031c8c2cfdfabd6af7511bb4bc9a68d720e",
+                "sha256:cbdb0b3db99dd1d5f69d31b4234367d55475add31df4d84a3bd690ef017b55e2",
+                "sha256:cdf24c1b9bbeb4936456b42ac5bd32c60bb194a344951acb6bfb0cddee5439a4",
+                "sha256:d14701a12417930392cd3898e9646cf5670c190b933625ebe7511b1f7d7b8736",
+                "sha256:d177fe1ff47cc235942d628d41ee5b1c6930d8f009f1a451c39b5411e8d0d4cf",
+                "sha256:d5bf9c863ba4717b3917b5227463ee06860fc43931dc9026747de416c0a10fee",
+                "sha256:dd968a174aa82f3341a615a033fa6a8169e9320cbb46130686562db132d7f1f0",
+                "sha256:f0ed4483c258fb23150e31b91ea7d25ff8495dba108aea0b0d4206a777705350",
+                "sha256:f18c3ed484eeeaa43a0d45dc2efb4d00fc6542ccdcfa2c45d7b635096a2ae534",
+                "sha256:f1d2108e770907540b5248977e4cff9ffaf0f73d0d13445ee938df06ca7579c6",
+                "sha256:f3ec00f023d84526381ad0c0f2cff982852d035c921bbf8ceb994f4886c00c64",
+                "sha256:f74429a07fedb36a03c159332b914e6de757176064f9fed94b5f79ebac07d913",
+                "sha256:fec42690a2eb646b384eafb021c425fab48991587edb412d4db77acc358b27ce"
             ],
             "markers": "python_version >= '3.6'",
-            "version": "==0.24.1"
+            "version": "==0.24.2"
         },
         "scipy": {
             "hashes": [
-                "sha256:03f1fd3574d544456325dae502facdf5c9f81cbfe12808a5e67a737613b7ba8c",
-                "sha256:0c81ea1a95b4c9e0a8424cf9484b7b8fa7ef57169d7bcc0dfcfc23e3d7c81a12",
-                "sha256:1fba8a214c89b995e3721670e66f7053da82e7e5d0fe6b31d8e4b19922a9315e",
-                "sha256:37f4c2fb904c0ba54163e03993ce3544c9c5cde104bcf90614f17d85bdfbb431",
-                "sha256:50e5bcd9d45262725e652611bb104ac0919fd25ecb78c22f5282afabd0b2e189",
-                "sha256:6ca1058cb5bd45388041a7c3c11c4b2bd58867ac9db71db912501df77be2c4a4",
-                "sha256:77f7a057724545b7e097bfdca5c6006bed8580768cd6621bb1330aedf49afba5",
-                "sha256:816951e73d253a41fa2fd5f956f8e8d9ac94148a9a2039e7db56994520582bf2",
-                "sha256:96620240b393d155097618bcd6935d7578e85959e55e3105490bbbf2f594c7ad",
-                "sha256:993c86513272bc84c451349b10ee4376652ab21f312b0554fdee831d593b6c02",
-                "sha256:adf7cee8e5c92b05f2252af498f77c7214a2296d009fc5478fc432c2f8fb953b",
-                "sha256:bc52d4d70863141bb7e2f8fd4d98e41d77375606cde50af65f1243ce2d7853e8",
-                "sha256:c1d3f771c19af00e1a36f749bd0a0690cc64632783383bc68f77587358feb5a4",
-                "sha256:d744657c27c128e357de2f0fd532c09c84cd6e4933e8232895a872e67059ac37",
-                "sha256:e3e9742bad925c421d39e699daa8d396c57535582cba90017d17f926b61c1552",
-                "sha256:e547f84cd52343ac2d56df0ab08d3e9cc202338e7d09fafe286d6c069ddacb31",
-                "sha256:e89091e6a8e211269e23f049473b2fde0c0e5ae0dd5bd276c3fc91b97da83480",
-                "sha256:e9da33e21c9bc1b92c20b5328adb13e5f193b924c9b969cd700c8908f315aa59",
-                "sha256:ffdfb09315896c6e9ac739bb6e13a19255b698c24e6b28314426fd40a1180822"
+                "sha256:01b38dec7e9f897d4db04f8de4e20f0f5be3feac98468188a0f47a991b796055",
+                "sha256:10dbcc7de03b8d635a1031cb18fd3eaa997969b64fdf78f99f19ac163a825445",
+                "sha256:19aeac1ad3e57338723f4657ac8520f41714804568f2e30bd547d684d72c392e",
+                "sha256:1b21c6e0dc97b1762590b70dee0daddb291271be0580384d39f02c480b78290a",
+                "sha256:1caade0ede6967cc675e235c41451f9fb89ae34319ddf4740194094ab736b88d",
+                "sha256:23995dfcf269ec3735e5a8c80cfceaf384369a47699df111a6246b83a55da582",
+                "sha256:2a799714bf1f791fb2650d73222b248d18d53fd40d6af2df2c898db048189606",
+                "sha256:3274ce145b5dc416c49c0cf8b6119f787f0965cd35e22058fe1932c09fe15d77",
+                "sha256:33d1677d46111cfa1c84b87472a0274dde9ef4a7ef2e1f155f012f5f1e995d8f",
+                "sha256:44d452850f77e65e25b1eb1ac01e25770323a782bfe3a1a3e43847ad4266d93d",
+                "sha256:9e3302149a369697c6aaea18b430b216e3c88f9a61b62869f6104881e5f9ef85",
+                "sha256:a75b014d3294fce26852a9d04ea27b5671d86736beb34acdfc05859246260707",
+                "sha256:ad7269254de06743fb4768f658753de47d8b54e4672c5ebe8612a007a088bd48",
+                "sha256:b30280fbc1fd8082ac822994a98632111810311a9ece71a0e48f739df3c555a2",
+                "sha256:b79104878003487e2b4639a20b9092b02e1bad07fc4cf924b495cf413748a777",
+                "sha256:d449d40e830366b4c612692ad19fbebb722b6b847f78a7b701b1e0d6cda3cc13",
+                "sha256:d647757373985207af3343301d89fe738d5a294435a4f2aafb04c13b4388c896",
+                "sha256:f68eb46b86b2c246af99fcaa6f6e37c7a7a413e1084a794990b877f2ff71f7b6",
+                "sha256:fdf606341cd798530b05705c87779606fcdfaf768a8129c348ea94441da15b04"
             ],
             "markers": "python_version < '3.10' and python_version >= '3.7'",
-            "version": "==1.6.2"
+            "version": "==1.6.3"
         },
         "simplejson": {
             "hashes": [
@@ -513,56 +496,56 @@
         },
         "sqlalchemy": {
             "hashes": [
-                "sha256:013b659efe02f0f58e7f759602584899c921c178c6a972978f16460dcdd782d5",
-                "sha256:193c3ca465fbc68de071995a461ab535466f041089d372ee6a6f0aae7b9307e6",
-                "sha256:2071ee6cd9390a9527a80ef03458fb58e0166bb299db2c62f9d688b6772d76a1",
-                "sha256:21becd8b45ec70b703239cf915104e47889c2aad96d0f68f597b9b547cbfd787",
-                "sha256:2713b338d9c54d2c3c7ff4f7786a40a5ca85013c8ccea00327b034d42598e22e",
-                "sha256:2a042c27b1a32a87f4cead53bcdd28999324992650896094368a595165b31d97",
-                "sha256:2e65c1146f5b4151cc6e553d9847299c97f53640d94ba88b1c534e15cdc6ac38",
-                "sha256:345c201324066b789804411f07eea750e9f29872be052eba221ce76add647d50",
-                "sha256:360a771b538463053383fb6ff7aceffb595248d7059bb9e003bf70562a66510d",
-                "sha256:432e98e6fe0d24e8181eb4177e59cba9f8831dcaf272a0d2de75bc8b933952a0",
-                "sha256:4387ebd5ae8bc2c716dbfc1ece769c867307eeecc192e72a4d2e7fa0fc092646",
-                "sha256:43fef20dd1024409375cc646a4b5afaffb62f6488e41588cde2a1ed2e9432b5b",
-                "sha256:4d71ee83441826fb48771e58cef51191500a87734b4acb6b698ca018479395bd",
-                "sha256:4eeff8b12c7d22be4de98721bba5a042875f4365e9fd20dc3916eec474ccb81e",
-                "sha256:534c71caa87c7fdb136ce5073fb42b732a4eb390946f503d8e1d7ce6a4a79100",
-                "sha256:66467123c220689d55c6d51fdf88f7b0b62b8078823c5f6c0297ab47c22003d7",
-                "sha256:6c4af3aceeff6a0e2bd3657d8b25714a9f7c7c606e7ec52029284973094f84c1",
-                "sha256:7d252dea33c1ee07b3d702fb4962963996ea40e5a2615dbe7646ccabd851ac76",
-                "sha256:86a7321636f851c6e8009901c5d67e97d82b86ee8c6f28a476691c41c3d71a95",
-                "sha256:88d75ea6b4330a6f5596a49904f21762ff89ca763db065d63b815ad8c3d68952",
-                "sha256:8a296bbf367867aee2ea8d5b391cb04fbdb3ca7277cd1649d9e8114620f3b090",
-                "sha256:933427a5474e014d01bac93224cd4e2bc7bbc7ce531d0bd7e55e4f940cc8ce0d",
-                "sha256:93f6fe67a76d7fa1cca3b9febb36e9f2dd76055230e2bfa317969532f34c03ab",
-                "sha256:a687e552ab4ffedcf3ec3bd5256ab3e753b4f605b467e9fa39690b2dadb5f607",
-                "sha256:a69787f7fc87b84df7e2f27158476cdf39a79ebb95af1d6f696e474724af9ebe",
-                "sha256:a76c10b467f7d385e4cffe2185d975336acf0dbf24ed702c46207df0fb64055e",
-                "sha256:b093bd6efb49332021714bed5752e784a34ae6d6896ec56ffdc32cc83275a215",
-                "sha256:bdeb300bb9adc02f98957cd0cf0c38d641bdd435b0927e39870a772e0a750bc0",
-                "sha256:c719f0058951457a7761bb69c2e47781a9989ab4819b7a30b6b39141ad013a5f",
-                "sha256:cadb58aeadd9916e79e8f99a49d0c0a9e61ae2b24469c2b304a0699e41a25e59",
-                "sha256:cc3c0d87b11ae1dd1ccbd6fc7875a290b3f73b771254180c2e7b19c2aec7379b",
-                "sha256:d42b8e2bffdf9e01d66cf46472b938493b854ea790a0fbe2e2e42624fc253b33",
-                "sha256:d7684e0598acfbfb5110bea482d8c5e94f52001d6d66b5558177f41f49fb5930",
-                "sha256:e5267cd2e51ddefbe10bb182c36ba41cdaa51c83a0fdfa63ed8cbe89cbcf0f33"
+                "sha256:0140f6dac2659fa6783e7029085ab0447d8eb23cf4d831fb907588d27ba158f7",
+                "sha256:034b42a6a59bf4ddc57e5a38a9dbac83ccd94c0b565ba91dba4ff58149706028",
+                "sha256:03a503ecff0cc2be3ad4dafd220eaff13721edb11c191670b7662932fb0a5c3a",
+                "sha256:069de3a701d33709236efe0d06f38846b738b19c63d45cc47f54590982ba7802",
+                "sha256:1735e06a3d5b0793d5ee2d952df8a5c63edaff6383c2210c9b5c93dc2ea4c315",
+                "sha256:19633df6be629200ff3c026f2837e1dd17908fb1bcea860290a5a45e6fa5148e",
+                "sha256:1e14fa32969badef9c309f55352e5c46f321bd29f7c600556caacdaa3eddfcf6",
+                "sha256:31e941d6db8b026bc63e46ef71e877913f128bd44260b90c645432626b7f9a47",
+                "sha256:452c4e002be727cb6f929dbd32bbc666a0921b86555b8af09709060ed3954bd3",
+                "sha256:45a720029756800628359192630fffdc9660ab6f27f0409bd24d9e09d75d6c18",
+                "sha256:4a2e7f037d3ca818d6d0490e3323fd451545f580df30d62b698da2f247015a34",
+                "sha256:4a7d4da2acf6d5d068fb41c48950827c49c3c68bfb46a1da45ea8fbf7ed4b471",
+                "sha256:4ad4044eb86fbcbdff2106e44f479fbdac703d77860b3e19988c8a8786e73061",
+                "sha256:4f631edf45a943738fa77612e85fc5c5d3fb637c4f5a530f7eedd1a7cd7a70a7",
+                "sha256:6389b10e23329dc8b5600c1a84e3da2628d0f437d8a5cd05aefd1470ec571dd1",
+                "sha256:6ebd58e73b7bd902688c0bb8dbabb0c36b756f02cc7b27ad5efa2f380c611f95",
+                "sha256:7180830ea1082b96b94884bc352b274e29b45151b6ee911bf1fd79cba2de659b",
+                "sha256:789be639501445d85fd4ca41d04f0f5c6cbb6deb0c6826aaa6f22774fe84ef94",
+                "sha256:7d89add44938ea4f52c7641d5805c9e154fed4381e874ef3221483eeb191a96d",
+                "sha256:842b0d4698381aac047f8ae57409c90b7e63ebabf5bc02814ddc8eaefd13499e",
+                "sha256:8f96d4b6a49d3f0f109365bb6303ae5d266d3f90280ca68cf8b2c46032491038",
+                "sha256:961b089e64c2ad29ad367487dd3ba1aa3eeba56bc82037ce91732baaa0f6ca90",
+                "sha256:96de1d4a2e05d4a017087cb29cd6a8ebfeecfd0e9f872880b1a589f011c1c02e",
+                "sha256:98214f04802a3fc740038744d8981a8f2fdca710f791ca125fc4792737d9f3a7",
+                "sha256:9cf94161cb55507cee147bf8abcfd3c076b353ad18743296764dd81108ea74f8",
+                "sha256:9fdf0713166f33e5e6ea98cf59deb305cb323131277f6880de6c509f468076f8",
+                "sha256:a41ab83ecfadf38a47bdfaf4e488f71579df47a711e1ab1dce30d34c7c25bd00",
+                "sha256:ac14fee167653ec6dee32d6aa4d501d90ae1bfbbc3eb5816940bccf227f0d617",
+                "sha256:b8b7d66ee8b8ac272adce0af1342a60854f0d89686e6d3318127a6a82a2f765c",
+                "sha256:bb1072fdf48ba870c0fe81bee8babe4ba2f096fb56bb4f3e0c2386a7626e405c",
+                "sha256:cd823071b97c1a6ac3af9e43b5d861126a1304033dcd18dfe354a02ec45642fe",
+                "sha256:d08173144aebdf30c21a331b532db16535cfa83deed12e8703fa6c67c0894ffc",
+                "sha256:e7d76312e904aa4ea221a92c0bc2e299ad46e4580e2d72ca1f7e6d31dce5bfab",
+                "sha256:f772e4428d413c0affe2a34836278fbe9df9a9c0940705860c2d3a4b50af1a66"
             ],
             "index": "pypi",
-            "version": "==1.4.6"
+            "version": "==1.4.11"
         },
         "sqlalchemy-utils": {
             "hashes": [
-                "sha256:fb66e9956e41340011b70b80f898fde6064ec1817af77199ee21ace71d7d6ab0"
+                "sha256:c7bec2c982b31ec6133ba519f73f07653bbb7e7b3c23836bb8d9133045386b68"
             ],
-            "version": "==0.36.8"
+            "version": "==0.37.0"
         },
         "sqlparse": {
             "hashes": [
                 "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
                 "sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
             ],
-            "index": "pypi",
+            "markers": "python_version >= '3.5'",
             "version": "==0.4.1"
         },
         "threadpoolctl": {
@@ -581,15 +564,6 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==4.60.0"
         },
-        "typing-extensions": {
-            "hashes": [
-                "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
-                "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
-                "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
-            ],
-            "markers": "python_version < '3.8'",
-            "version": "==3.7.4.3"
-        },
         "urllib3": {
             "hashes": [
                 "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df",
@@ -597,14 +571,6 @@
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
             "version": "==1.26.4"
-        },
-        "zipp": {
-            "hashes": [
-                "sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76",
-                "sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098"
-            ],
-            "markers": "python_version >= '3.6'",
-            "version": "==3.4.1"
         }
     },
     "develop": {}

+ 35 - 14
cdplib/db_handlers/InfluxdbHandler.py

@@ -82,8 +82,15 @@ class InfluxdbHandler:
         try:
             # result of the query is a defaultdict
             result = self.client.query(query)
-
-            return list(result.values())[0]
+            
+            if len(list(result.values())) > 0:
+
+                return list(result.values())[0]
+            
+            else:
+                
+                return pd.DataFrame()
+            
         except Exception as e:
             self._logger.log_and_raise_error(
                 ("Could not query to dataframe. "
@@ -91,8 +98,8 @@ class InfluxdbHandler:
 
     def query_between_dates(self, columns: str,
                             tables: str,
-                            start: str,
-                            stop: str) -> pd.DataFrame:
+                            start: str = None,
+                            stop: str = None) -> pd.DataFrame:
         """
         :param columns: DESCRIPTION
         :type columns: str
@@ -106,21 +113,35 @@ class InfluxdbHandler:
         :rtype: TYPE
 
         """
-        if not isinstance(start, str):
+        if (start is not None) and (not isinstance(start, str)):
             start = datetime.strftime(start, format="%Y-%m-%dT%H:%M:%SZ")
 
-        if not isinstance(stop, str):
+        if (stop is not None) and (not isinstance(stop, str)):
             stop = datetime.strftime(stop, format="%Y-%m-%dT%H:%M:%SZ")
-
-        query = 'SELECT ' +\
-                columns +\
-                ' FROM \"' +\
-                tables +\
-                '\" WHERE time > \'' +\
+            
+        query = 'SELECT ' + columns + ' FROM \"' + tables
+            
+        if (start is not None) and (stop is not None):
+            
+             query += '\" WHERE time > \'' +\
                 str(start) +\
                 '\' AND time  < \'' +\
                 str(stop) +\
                 '\' tz(\'Europe/Berlin\');'
+                
+        elif start is not None:
+            
+            query += '\" WHERE time >= \'' + str(start) +\
+                '\' tz(\'Europe/Berlin\');'
+            
+        elif stop is not None:
+            
+            query += '\" WHERE time <= \'' + str(stop) +\
+                '\' tz(\'Europe/Berlin\');'
+                
+        else:
+            query += ';'
+            
 
         return self.query_to_dataframe(query)
 
@@ -147,12 +168,12 @@ class InfluxdbHandler:
         """
         
         measurement_columns = [c for c in dataframe.columns
-                               if c not in tag_columns]
+                               if c not in (tag_columns or [])]
         
         for column in measurement_columns:
             try:
                 self.client.write_points(
-                    dataframe=dataframe[[column] + tag_columns],
+                    dataframe=dataframe[[column] + (tag_columns or [])],
                     measurement=column,
                     tag_columns=tag_columns,
                     protocol='line',

+ 173 - 0
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 23 08:51:53 2020
+
+@author: tanya
+
+@description: class for fine-tuning a sklearn classifier
+(optimizing the probability threshold)
+"""
+
+import pandas as pd
+import numpy as np
+
+from typing import Callable
+
+from sklearn.base import (BaseEstimator, ClassifierMixin,
+                          clone, MetaEstimatorMixin)
+
+from cdplib.log import Log
+
+from cdplib.utils.TyperConverter import TypeConverter
+
+
+class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
+                            MetaEstimatorMixin):
+    """
+    Probability threshold tuning for a given estimator.
+    Overrides the method predict of the given sklearn classifier
+    and returns predictions with the optimal value of
+    the probability threshold.
+
+    The threshold is chosen per cross-validation fold by evaluating
+    cost_func on a grid of candidate thresholds; the final threshold
+    is the mean of the per-fold optima.
+
+    An object of this class can be passed to an sklearn Pipeline
+    """
+    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
+                 cv=None, threshold_step: float = 0.1):
+        """
+        :param estimator: classifier exposing fit and predict_proba
+        :param cost_func: callable(y_true, y_pred) returning a score
+        :param greater_is_better: True if a larger cost_func value
+            is better, False if a smaller one is better
+        :param cv: cross-validation specification; anything accepted
+            by sklearn.model_selection.check_cv (None, an int,
+            a splitter object or an iterable of
+            (train_indices, validation_indices) pairs)
+        :param threshold_step: step of the threshold grid, which is
+            np.arange(threshold_step, 1, threshold_step)
+        """
+        self.estimator = estimator
+
+        self.is_fitted = False
+
+        self.greater_is_better = greater_is_better
+
+        # Stored unchanged; it is resolved into concrete splits in fit.
+        self.cv = cv
+
+        self.cost_func = cost_func
+
+        self.threshold_step = threshold_step
+
+        # Default threshold used until fit computes a tuned one.
+        self.optimal_threshold = 0.5
+
+        self._logger = Log("FineTunedClassifyCV")
+
+    @staticmethod
+    def _positive_class_proba(proba_pred):
+        """
+        Reduce the output of predict_proba to one probability per sample.
+
+        A two-column array is reduced to its second column, any other
+        input is returned unchanged.
+        NOTE(review): assumes a binary classifier whose second
+        predict_proba column is the positive class - confirm.
+        """
+        proba_arr = np.asarray(proba_pred)
+
+        if proba_arr.ndim == 2 and proba_arr.shape[1] == 2:
+            return proba_arr[:, 1]
+
+        return proba_pred
+
+    def _get_best_threshold(self, y_val: (pd.DataFrame, np.ndarray),
+                            proba_pred: (pd.DataFrame, np.ndarray)):
+        """
+        Evaluate cost_func on every threshold of the grid and return
+        the threshold with the best (max or min, depending on
+        greater_is_better) cost.
+
+        :param y_val: true labels of the validation fold
+        :param proba_pred: predicted probabilities for the same fold
+        :return: the best threshold of the grid
+        """
+        costs = {}
+
+        for t in np.arange(self.threshold_step, 1, self.threshold_step):
+            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))
+
+        if self.greater_is_better:
+            return max(costs, key=costs.get)
+        else:
+            return min(costs, key=costs.get)
+
+    def fit(self, X: (pd.DataFrame, np.ndarray),
+            y: (pd.DataFrame, np.ndarray) = None,
+            **fit_args):
+        """
+        Tune the probability threshold by cross-validation and then
+        fit the wrapped estimator on the full data.
+
+        :param X: feature matrix
+        :param y: target labels
+        :param fit_args: extra keyword arguments forwarded to
+            the fit method of the estimator
+        :return: self
+        """
+        # Imported locally so that the block does not depend on
+        # a change of the module-level imports.
+        from sklearn.model_selection import check_cv
+
+        X = TypeConverter().convert_to_ndarray(X)
+        if y is not None:
+            # was converting X here, which replaced the labels by the features
+            y = TypeConverter().convert_to_ndarray(y)
+
+        # check_cv turns None / int / splitter / iterable of index
+        # pairs into an object with a uniform split method.
+        splitter = check_cv(self.cv, y, classifier=True)
+
+        optimal_thrs_per_fold = []
+
+        for train_inds, val_inds in splitter.split(X, y):
+            X_train, X_val = X[train_inds], X[val_inds]
+
+            if y is not None:
+                y_train, y_val = y[train_inds], y[val_inds]
+            else:
+                y_train, y_val = None, None
+
+            # Clone the own estimator (not a module-level global), so
+            # that every fold starts from an unfitted copy.
+            estimator = clone(self.estimator)
+
+            estimator.fit(X_train, y_train, **fit_args)
+
+            proba_pred = self._positive_class_proba(
+                estimator.predict_proba(X_val))
+
+            optimal_thr = self._get_best_threshold(y_val, proba_pred)
+
+            optimal_thrs_per_fold.append(optimal_thr)
+
+        # With no folds the default threshold is kept instead of
+        # taking the mean of an empty list (which would be nan).
+        if optimal_thrs_per_fold:
+            self.optimal_threshold = np.mean(optimal_thrs_per_fold)
+
+        self.estimator.fit(X, y, **fit_args)
+
+        self.is_fitted = True
+
+        return self
+
+    def predict(self, X: (pd.DataFrame, np.ndarray)) -> np.ndarray:
+        """
+        Predict 0/1 labels by comparing the predicted probability
+        with the tuned threshold.
+
+        :param X: feature matrix
+        :return: integer array of predictions, or None (after logging
+            a warning) if fit has not been called yet
+        """
+        if self.is_fitted:
+
+            proba_pred = self._positive_class_proba(
+                self.estimator.predict_proba(X))
+
+            return (proba_pred >= self.optimal_threshold).astype(int)
+
+        else:
+            self._logger.warn("You should fit first")
+
+            return None
+
+    def get_params(self, deep: bool = True):
+        """
+        Return the parameters of the wrapped estimator extended by
+        the entries "cv" and "cost_func" of this object.
+
+        :param deep: forwarded to get_params of the estimator
+        :return: dictionary of parameters
+        """
+        params = self.estimator.get_params(deep=deep)
+
+        params.update({"cv": self.cv, "cost_func": self.cost_func})
+
+        return params
+
+    def set_params(self, **params: dict):
+        """
+        Set "cv" and "cost_func" on this object and forward all
+        remaining parameters to the wrapped estimator.
+
+        :return: self
+        """
+        # Pop the own parameters before forwarding; removing entries
+        # while iterating over the dict raises a RuntimeError.
+        if "cv" in params:
+            self.cv = params.pop("cv")
+
+        if "cost_func" in params:
+            self.cost_func = params.pop("cost_func")
+
+        self.estimator.set_params(**params)
+
+        return self
+
+
+if __name__ == "__main__":
+    # Smoke test: tune the threshold of a random-forest style XGBoost
+    # classifier on a binarized iris target (class 1 versus the rest).
+    import gc
+    from sklearn.datasets import load_iris
+    from sklearn.metrics import accuracy_score
+    from xgboost import XGBRFClassifier
+
+    iris = load_iris()
+    X = iris["data"]
+    y = (iris["target"] == 1).astype(int)
+    del iris
+    gc.collect()
+
+    # Custom cv object: expanding training window starting at half of
+    # the data, followed by a validation fold of a tenth of the data.
+    fold_size = len(X) // 10
+
+    cv = [(list(range(start)), list(range(start, start + fold_size)))
+          for start in range(len(X) // 2, len(X), fold_size)]
+
+    fine_tuned_clf = FineTunedClassifierCV(estimator=XGBRFClassifier(),
+                                           cv=cv,
+                                           greater_is_better=True,
+                                           cost_func=accuracy_score)
+
+    fine_tuned_clf.fit(X=X, y=y)
+

+ 1 - 0
cdplib/fine_tuning/__init__.py

@@ -0,0 +1 @@
+from .FineTunedClassiferCV import *

+ 384 - 0
cdplib/gridsearch/GridSearchPipelineSelector.py

@@ -0,0 +1,384 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:15:17 2020
+
+@author: tanya
+@description: a class for selecting a machine learning
+ pipeline from a deterministic space of parameter distributions
+ over multiple pipelines.
+ The selection is designed in such a way that a trials object is
+ maintained during the tuning process, from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import os
+import datetime
+import numpy as np
+from copy import deepcopy
+from itertools import product
+from collections import ChainMap
+from sklearn.pipeline import Pipeline
+from typing import Callable, Optional, Literal, Dict, Union, List
+from cdplib.log import Log
+
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
+
+
+class GridSearchPipelineSelector(PipelineSelector):
+    """
+    A class for selecting a machine learning
+     pipeline from a deterministic space of parameter distributions
+     over multiple pipelines.
+     The selection is designed in such a way that a trials object
+     (here a list of dicts) is maintained during the tuning process,
+     from which one can retrieve the best pipeline so far
+     as well as the entire tuning history if needed.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_validation_needs_scorer: bool = True,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"
+                 ):
+        """
+        Forward all arguments to PipelineSelector.__init__, create the
+        logger of this class and make sure the trials object is a list.
+
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+            If None, the trials object is backed up every time
+            the score improves.
+
+        :param bool cross_validation_needs_scorer: passed unchanged to
+            the parent class; its meaning is defined there
+            (not visible in this module).
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metrics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        try:
+
+            super().__init__(cost_func=cost_func,
+                             greater_is_better=greater_is_better,
+                             trials_path=trials_path,
+                             backup_trials_freq=backup_trials_freq,
+                             cross_validation_needs_scorer=
+                                 cross_validation_needs_scorer,
+                             cross_val_averaging_func=cross_val_averaging_func,
+                             additional_metrics=additional_metrics,
+                             strategy_name=strategy_name,
+                             stdout_log_level=stdout_log_level)
+
+            self._logger = Log("GridsearchPipelineSelector: ",
+                               stdout_log_level=stdout_log_level)
+
+            # keep the trials set by the parent class if they are truthy,
+            # else start with an empty list of trials
+            self._trials = self._trials or []
+
+        except Exception as e:
+            err = "Failed initialization. Exit with error: {}".format(e)
+
+            # NOTE(review): relies on self._logger existing already when
+            # the parent initialization fails - confirm in PipelineSelector
+            self._logger.log_and_raise_error(err)
+
+    def run_trials(self) -> None:
+        """
+        """
+        try:
+            assert(self.attached_space),\
+                "Parameter distribution space must be attached"
+                
+            # XXX Tanya: if the list of values is empty
+            # in the space element, remove it
+
+            done_trial_ids = [{"name": trial["name"],
+                               "params": trial["params"],
+                               "status": trial["status"]}
+                              for trial in self._trials]
+
+            # list (generator) of (flattened) dictionaries
+            # with all different combinations of
+            # parameters for different pipelines
+            # from the space definition.
+            space_unfolded = ({"name": param_dist["name"],
+                               "pipeline": param_dist["pipeline"],
+                               "params": param_set}
+                              for param_dist in self._space
+                              for param_set in
+                              (dict(ChainMap(*tup)) for tup in
+                               product(*[[{k: v} for v in
+                                          param_dist["params"][k]]
+                                         for k in param_dist["params"]])))
+
+            for space_element in space_unfolded:
+
+                # uniquely identifies the current space element
+                trial_id = {"name": space_element["name"],
+                            "params": space_element["params"],
+                            "status": 'ok'}
+
+                # verify if the current pipline/parameters
+                # were already tested before
+                if trial_id in done_trial_ids:
+                    continue
+
+                result = self._objective(space_element)
+
+                pipeline = deepcopy(space_element["pipeline"])
+                
+                pipeline = pipeline.set_params(**space_element["params"])
+
+                trial = {"name": space_element["name"],
+                         "params": space_element["params"],
+                         "pipeline": pipeline}
+
+                trial.update(result)
+
+                self._trials.append(trial)
+
+            self.finished_tuning = True
+
+            self.total_tuning_time = datetime.datetime.today()\
+                - self.start_tuning_time
+
+            self._backup_trials()
+
+        except Exception as e:
+            err = "Failed to run trials. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def number_of_trials(self) -> Union[int, None]:
+        """
+        Number of trials already run in the current trials object
+        """
+        try:
+            return len(self._trials)
+
+        except Exception as e:
+            err = ("Failed to retrieve the number of trials. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial(self) -> Union[dict, None]:
+        """
+        """
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return max(self._trials, key=lambda x: x["score"])
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score(self) -> Union[float, None]:
+        '''
+        '''
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return self.best_trial["score"]
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score_variance(self) -> Union[float, None]:
+        '''
+        '''
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return self.best_trial["score_variance"]
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
+        '''
+        '''
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return self.best_trial["pipeline"]
+
+        except Exception as e:
+            err = ("Could not retrieve the best trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
+        """
+        N best pipelines with corresponding
+        best hyperparameters
+        """
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return [trial["pipeline"] for trial in
+                    sorted(self._trials, key=lambda x: x["score"],
+                           reverse=True)[:n]]
+
+        except Exception as e:
+            err = ("Failed to retrieve n best trials. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
+            -> Union[Dict[str, List[Pipeline]], None]:
+        """
+        If the hyperparameter search is done over multiple
+        pipelines, then returns n different pipeline-types
+        with corresponding hyperparameters
+        """
+        try:
+            assert(len(self._trials) > 0),\
+                ("Trials object is empty. "
+                 "Call run_trials method.")
+
+            return pd.DataFrame(self._trials)\
+                     .sort_values(by=["name", "score"],
+                                  ascending=False)\
+                     .groupby("name")\
+                     .head(n)\
+                     .groupby("name")["pipeline"]\
+                     .apply(lambda x: list(x))\
+                     .to_dict()
+
+        except Exception as e:
+            err = ("Failed to retrieve n best trials of each type."
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def trials_to_excel(self, path: str) -> None:
+        """
+        Trials object in the shape of table written to excel,
+        should contain the run number, pipeline (as str),
+        hyperparamters (as str), self.best_result (see self._objective method)
+        as well as additional information configured
+        through self.save_result method.
+        """
+        try:
+            pd.DataFrame(self._trials).to_excel(path)
+
+        except Exception as e:
+            err = ("Failed to write trials to excel. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+
+if __name__ == "__main__":
+
+    # elementary example
+
+    from sklearn.datasets import load_breast_cancer
+    from sklearn.metrics import accuracy_score, precision_score
+    from cdplib.gridsearch.space_sample import space
+    from cdplib.db_handlers import MongodbHandler
+    import pickle
+    import pandas as pd
+
+    trials_path = "gridsearch_trials_TEST.pkl"
+    additional_metrics = {"precision": precision_score}
+    strategy_name = "strategy_1"
+    data_path = "data_TEST.h5"
+    cv_path = "cv_TEST.pkl"
+    collection_name = 'TEST_' + strategy_name
+
+    logger = Log("GridSearchPipelineSelector__TEST:")
+
+    logger.info("Start test")
+
+    data_loader = load_breast_cancer()
+
+    X = data_loader["data"]
+    y = data_loader["target"]
+
+    pd.DataFrame(X).to_hdf(data_path, key="X_train")
+    pd.Series(y).to_hdf(data_path, key="y_train")
+
+    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
+          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]
+
+    pickle.dump(cv, open(cv_path, "wb"))
+
+    gs = GridSearchPipelineSelector(cost_func=accuracy_score,
+                                    greater_is_better=True,
+                                    trials_path=trials_path,
+                                    additional_metrics=additional_metrics,
+                                    strategy_name=strategy_name,
+                                    stdout_log_level="WARNING")
+
+    gs.attach_space(space=space)
+
+    gs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
+                             cv_pickle_path=cv_path)
+
+    save_method = MongodbHandler().insert_data_into_collection
+    save_kwargs = {'collection_name': collection_name}
+
+    gs.configer_summary_saving(save_method=save_method,
+                               kwargs=save_kwargs)
+
+    gs.run_trials()
+
+    logger.info("Best trial: {}".format(gs.best_trial))
+    logger.info("Total tuning time: {}".format(gs.total_tuning_time))
+
+    for file in [trials_path, data_path, cv_path]:
+        os.remove(file)
+
+    logger.info("End test")
+    
+    # XXX Tanya check warnings

+ 2 - 0
cdplib/gridsearch/__init__.py

@@ -0,0 +1,2 @@
+from .GridSearchPipelineSelector import *
+from .space_sample import *

+ 33 - 0
cdplib/gridsearch/space_sample.py

@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+
+# Sample search space: a list with one dict per candidate pipeline.
+# Every dict holds
+#   "name":     identifier of the pipeline,
+#   "pipeline": the sklearn Pipeline object,
+#   "params":   lists of candidate values keyed by "<step>__<parameter>".
+space = [
+        {"name": "std_scaler_kbest_rf",
+         "pipeline": Pipeline([
+                 ("std_scaler", StandardScaler()),
+                 ("kbest", SelectPercentile()),
+                 ("rf", RandomForestClassifier())]),
+         "params": {"kbest__percentile": [2, 3],
+                    "rf__n_estimators": [10, 20]}},
+
+        {"name": "std_scaler_pca_lr",
+         "pipeline": Pipeline([
+                 ("std_scaler", StandardScaler()),
+                 ("pca", PCA()),
+                 ("lr", LogisticRegression())]),
+         "params": {"lr__C": [0.5, 1],
+                    "pca__n_components": [2, 3]}}
+        ]

+ 499 - 0
cdplib/hyperopt/HyperoptPipelineSelector.py

@@ -0,0 +1,499 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Oct  6 15:04:25 2020
+
+@author: tanya
+@description: a class for selecting a machine learning
+ pipeline from a space of parameter distributions
+ over multiple pipelines.
+ The selection is designed in such a way that a Trials object is
+ maintained during the tuning process, from which one can retrieve
+ the best pipeline so far as well as the entire tuning history
+ if needed.
+"""
+
+import os
+
+import pickle
+
+from copy import deepcopy
+
+import datetime
+
+import pandas as pd
+import numpy as np
+
+from sklearn.pipeline import Pipeline
+
+from hyperopt import fmin, tpe, rand, Trials, space_eval
+
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector,\
+     SpaceElementType
+
+from typing import Callable, Optional, Literal, Dict, Union, List
+
+
+class HyperoptPipelineSelector(PipelineSelector):
+    """
+    Use this class to perform a search
+    for a machine learning pipeline in a given parameter space.
+    The parameter space can include multiple types of Pipelines
+    (SVM, XGBOOST, random forest, etc),
+    as well as parameter distributions for each pipeline parameter.
+    See example in main for the expected space structure.
+
+    The search can be performed either randomly
+    or with a tree-based algorithm. (Other methods are currently
+    being developed by the hyperopt creators).
+
+    Attribute trials is responsible for book-keeping parameter
+    combinations that have already been tried out. This attribute
+    is saved to a binary file every n minutes as well as every time
+    a better pipeline was found.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: Optional[int] = None,
+                 cross_validation_needs_scorer: bool = True,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param backup_trials_freq: frequecy in interations (trials)
+            of saving the trials object at the trials_path.
+            if None, the trials object is backed up avery time
+            the score improves.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metics: dict of additional metrics to save
+            of the form {"metric_name": metric} where metric is a Callable.
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+
+        try:
+
+            super().__init__(cost_func=cost_func,
+                             greater_is_better=greater_is_better,
+                             trials_path=trials_path,
+                             backup_trials_freq=backup_trials_freq,
+                             cross_validation_needs_scorer=
+                                 cross_validation_needs_scorer,
+                             cross_val_averaging_func=cross_val_averaging_func,
+                             additional_metrics=additional_metrics,
+                             strategy_name=strategy_name,
+                             stdout_log_level=stdout_log_level)
+
+            self._logger = Log("HyperoptPipelineSelector: ",
+                               stdout_log_level=stdout_log_level)
+
+            self._trials = self._trials or Trials()
+
+        except Exception as e:
+            err = "Failed to intialize. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    def run_trials(self,
+                   niter: int,
+                   algo: Literal[tpe.suggest, rand.suggest] = tpe.suggest)\
+            -> None:
+        '''
+        Method performing the search of the best pipeline in the given space.
+        Calls fmin function from the hyperopt library to minimize the output of
+        _objective.
+
+        :params int niter: number of search iterations
+        :param algo: now can only take supported by the hyperopt library.
+            For now these are tpe.suggest for a tree-based bayesian search
+            or rad.suggest for randomized search
+        '''
+        try:
+            self._trials = self._trials or Trials()
+
+            self._logger.info(("Starting {0} iterations of search "
+                               "additional to {1} previous"
+                               .format(niter, len(self._trials.trials))))
+
+            best_trial = fmin(fn=self._objective,
+                              space=self._space,
+                              algo=algo,
+                              trials=self._trials,
+                              max_evals=len(self._trials.trials) + niter)
+
+            self._logger.info(
+                    "Best score is {0} with variance {1}"
+                    .format(
+                     self._trials.best_trial["result"]["score"],
+                     self._trials.best_trial["result"]["score_variance"]))
+
+            self._logger.info(("Finished {0} iterations of search.\n"
+                               "Best parameters are:\n {1} ")
+                              .format(niter,
+                                      space_eval(self._space, best_trial)))
+
+            self.finished_tuning = True
+
+            self.total_tuning_time = datetime.datetime.today()\
+                - self.start_tuning_time
+
+            self._backup_trials()
+
+        except Exception as e:
+            err = ("Failed to select best "
+                   "pipeline! Exit with error: {}").format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def number_of_trials(self) -> Union[int, None]:
+        """
+        :return: number of trials run so far
+            with the given Trials object
+        """
+
+        try:
+            return len(self._trials.trials)
+
+        except Exception as e:
+            err = ("Failed to retrieve the number of trials. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_space_element_from_trial(self, trial: dict)\
+            -> Union[Dict[str, SpaceElementType], None]:
+        """
+        Hyperopt trials object does not contain the space
+             elements that result in the corresponding trials.
+             One has to use the function space_eval from
+             hyperopt to get the space element.
+
+        After retrieving the space element,
+            parameters of the pipeline are set.
+        """
+        try:
+            trial = deepcopy(trial)
+
+            assert(self.attached_space),\
+                "Hyperparameter space not attached."
+
+            space_element = space_eval(self._space,
+                                       {k: v[0] for k, v in
+                                        trial['misc']['vals'].items()
+                                        if len(v) > 0})
+
+            pipeline = deepcopy(space_element["pipeline"])
+            params = deepcopy(space_element["params"])
+            pipeline.set_params(**params)
+
+            space_element["pipeline"] = pipeline
+
+            return space_element
+
+        except Exception as e:
+            err = ("Failed to retrieve a space element from a trial. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_space_element_from_index(self, i: int)\
+            -> Union[Dict[str, SpaceElementType], None]:
+        """
+        Gets the space element of shape
+        {"name": NAME, "params": PARAMS, "pipeline": PIPELINE}
+        from the trial number i.
+        """
+        try:
+            assert(len(self._trials.trials) > i),\
+                ("Trials object is not long enough "
+                 "to retrieve index {}".format(i))
+
+            return self._get_space_element_from_trial(self._trials.trials[i])
+
+        except Exception as e:
+            err = ("Failed to get space element from index. "
+                   "Exit with error {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _get_pipeline_from_index(self, i: int) -> Union[Pipeline, None]:
+        """
+        Gets a pipeline with set parameters from the trial number i
+        """
+        try:
+            space_element = self._get_space_element_from_index(i)
+
+            return space_element["pipeline"]
+
+        except Exception as e:
+            err = ("Failed to retrieve pipeline from index. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial(self) -> Union[dict, None]:
+        """
+        :return: dictionary with the summary of the best trial
+            and space element (name, pipeline, params)
+            resulting in the best trial
+        """
+        if len(self._trials.trials) == 0:
+
+            self._logger.log_and_throw_warning("Trials object is empty")
+            return {}
+
+        else:
+
+            try:
+                best_trial = deepcopy(self._trials.best_trial)
+
+                if self.attached_space:
+
+                    space_element = self._get_space_element_from_trial(
+                            best_trial)
+                else:
+                    space_element = {}
+
+                    warn = ("Space is not attached, "
+                            "To included the best pipeline "
+                            "attach the space")
+                    self._logger.log_and_throw_warning(warn)
+
+                best_trial = deepcopy(self._trials.best_trial["result"])
+
+                best_trial.update(space_element)
+
+                return best_trial
+
+            except Exception as e:
+                err = "Failed to retrieve best trial. Exit with error: {}"\
+                    .format(e)
+
+                self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score(self) -> Union[float, None]:
+        """
+        """
+        try:
+            if len(self.best_trial) > 0:
+                return self.best_trial["score"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial score. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_score_variance(self) -> Union[float, None]:
+        """
+        """
+        try:
+            if len(self.best_trial) > 0:
+                return self.best_trial["score_variance"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial score variance. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
+        """
+        """
+        try:
+            if len(self.best_trial) > 0:
+                return self.best_trial["pipeline"]
+            else:
+                return np.nan
+
+        except Exception as e:
+            err = ("Failed to retrieve best trial pipeline. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
+        """
+        :return: the list of n best pipelines
+        documented in trials
+        """
+        try:
+            if len(self._trials.trials) == 0:
+                return []
+            else:
+                n_best_trials = sorted(self._trials.trials,
+                                       key=lambda x: x["result"]["score"],
+                                       reverse=True)[:n]
+
+                return [self._get_space_element_from_trial(trial)["pipeline"]
+                        for trial in n_best_trials]
+
+        except Exception as e:
+            err = ("Failed to retrieve n best pipelines. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
+            -> Union[Dict[str, List[Pipeline]], None]:
+        """
+        :param int n: maximal number of pipelines per pipeline name
+
+        :return: a dictionary where keys are pipeline names,
+        and values are lists of the best pipelines with this name
+        """
+        try:
+            # scores and names are collected in the order of
+            # self._trials.trials, so the row position in the data frame
+            # below equals the index of the trial
+            scores = [trial["result"]["score"]
+                      for trial in self._trials.trials]
+
+            names = [self._get_space_element_from_trial(trial)["name"]
+                     for trial in self._trials.trials]
+
+            # The descending sort places the highest scores of every name
+            # first, head(n) keeps the first n rows of every name.
+            # reset_index turns the original row position into the column
+            # "index", which is passed to _get_pipeline_from_index.
+            return pd.DataFrame({"name": names, "score": scores})\
+                     .sort_values(by=["name", "score"], ascending=False)\
+                     .groupby("name")\
+                     .head(n)\
+                     .reset_index()\
+                     .assign(pipeline=lambda x: x["index"]
+                             .apply(self._get_pipeline_from_index))\
+                     .groupby("name")["pipeline"]\
+                     .apply(lambda x: list(x))\
+                     .to_dict()
+
+        except Exception as e:
+            err = ("Failed to get n best pipelines of each type. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def trials_to_excel(self, path: str = None) -> None:
+        """
+        Saves an excel file with pipeline names, scores,
+        parameters, and timestamps.
+        """
+        try:
+            results = [trial["result"] for trial in self._trials.trials]
+
+            space_elements = [self._get_space_element_from_trial(trial)
+                              for trial in self._trials.trials]
+
+            pd.DataFrame([{**result, **space_element}
+                          for result, space_element in
+                          zip(results, space_elements)]).to_excel(path)
+
+        except Exception as e:
+            err = ("Failed to write trials to excel. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+
+if __name__ == '__main__':
+
+    # elementary example
+
+    from sklearn.metrics import roc_auc_score, precision_score
+    from sklearn.datasets import load_breast_cancer
+    from cdplib.log import Log
+    from cdplib.db_handlers import MongodbHandler
+    from cdplib.hyperopt.space_sample import space
+    # from cdplib.hyperopt.composed_space_sample import space
+
+    trials_path = "hyperopt_trials_TEST.pkl"
+    additional_metrics = {"precision": precision_score}
+    strategy_name = "strategy_1"
+    data_path = "data_TEST.h5"
+    cv_path = "cv_TEST.pkl"
+    collection_name = 'TEST_' + strategy_name
+
+    logger = Log("HyperoptPipelineSelector__TEST:")
+
+    logger.info("Start test")
+
+    data_loader = load_breast_cancer()
+
+    X = data_loader["data"]
+    y = data_loader["target"]
+
+    pd.DataFrame(X).to_hdf(data_path, key="X_train")
+    pd.Series(y).to_hdf(data_path, key="y_train")
+
+    cv = [(list(range(len(X)//3)), list(range(len(X)//3, len(X)))),
+          (list(range(2*len(X)//3)), list(range(2*len(X)//3, len(X))))]
+
+    pickle.dump(cv, open(cv_path, "wb"))
+
+    hs = HyperoptPipelineSelector(cost_func=roc_auc_score,
+                                  greater_is_better=True,
+                                  trials_path=trials_path,
+                                  additional_metrics=additional_metrics,
+                                  strategy_name=strategy_name,
+                                  stdout_log_level="WARNING")
+
+    hs.attach_space(space=space)
+
+    hs.attach_data_from_hdf5(data_hdf5_store_path=data_path,
+                             cv_pickle_path=cv_path)
+
+    try:
+
+        # TODO: this line causes a pytype to throw not-callable error
+        # works fine with pytype on other class methods.
+        save_method = MongodbHandler().insert_data_into_collection
+        save_kwargs = {'collection_name': collection_name}
+
+        # save_method = pd.DataFrame.to_excel()
+        # save_kwargs = {'excel_writer': "TEST.xlsx"}
+
+        hs.configer_summary_saving(save_method=save_method,
+                                   kwargs=save_kwargs)
+
+        logger.info("Configured summary saving in mongo")
+
+    except Exception as e:
+
+        logger.warning(("Could not configure summary saving in mongo. "
+                        "Exit with error: {}".format(e)))
+
+    hs.run_trials(niter=10)
+
+    logger.info("Best Trial: {}".format(hs.best_trial))
+    logger.info("Total tuning time: {}".format(hs.total_tuning_time))
+
+    for file in [trials_path, data_path, cv_path]:
+        os.remove(file)
+
+    logger.info("End test")

+ 3 - 0
cdplib/hyperopt/__init__.py

@@ -1 +1,4 @@
 from .HyperoptPipelineSelection import *
+from .HyperoptPipelineSelector import *
+from .composed_space_sample import *
+from .space_sample import *

+ 116 - 0
cdplib/hyperopt/composed_space_sample.py

@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul  6 14:02:24 2020
+
+@author: tanya
+@description: space object to pass to HyperoptPipelineSelection class
+"""
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectFromModel, SelectPercentile,\
+    RFE, SelectFpr, f_classif, chi2, mutual_info_classif
+from xgboost import XGBRFClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from hyperopt import hp
+import numpy as np
+
+from cdplib.hyperparameter_space_composer.SpaceComposer import SpaceComposer
+
+# TODO: add sample spaces for encoders and transformers
+
+# No encoder candidates are defined yet (see the TODO above).
+# SpaceComposer.compose_gridsearch_space drops steps with an empty list.
+encoders = []
+
+# No transformer candidates are defined yet either.
+transformers = []
+
+# Candidate feature selection steps. Every entry is a dict with the keys
+#   "name":   name of the step inside the composed pipeline,
+#   "object": the unfitted sklearn object of the step,
+#   "params": parameter name -> hyperopt expression to sample it from.
+# hp.randint(label, upper) draws an integer from [0, upper), so
+# "a + hp.randint(label, b)" covers the integers a, ..., a + b - 1.
+selectors = [
+    # univariate selection of the best percentile of features
+    {"name": "kbest",
+     "object": SelectPercentile(),
+     "params": {
+       "percentile": 3 + hp.randint("kbest__percentile", 60),
+       "score_func": hp.choice("kbest__score_func",
+                               [f_classif, chi2, mutual_info_classif])}},
+
+    # univariate selection based on a false positive rate test
+    {"name": "fpr",
+     "object": SelectFpr(),
+     "params": {
+        "score_func": hp.choice("fpr__score_func",
+                                [f_classif, chi2]),
+        # mutual_info_classif does not work here
+        "alpha": hp.uniform("fpr__alpha", 0.1, 0.6)}},
+
+    # recursive feature elimination driven by a random forest
+    # NOTE(review): n_features_to_select can be sampled up to 202;
+    # confirm this is intended for data sets with fewer features
+    {"name": "rfe_rf",
+     "object":
+         RFE(estimator=RandomForestClassifier(n_jobs=-1, random_state=33)),
+     "params": {
+         "n_features_to_select":
+             3 + hp.randint("rfe_rf__n_features_to_select", 200),
+         "estimator__n_estimators":
+             20 + hp.randint("rfe_rf__estimator__n_estimators", 70)}},
+
+    # model based selection with a random forest as the estimator
+    {"name": "rfm_rf",
+     "object":
+         SelectFromModel(estimator=RandomForestClassifier(n_jobs=-1,
+                                                          random_state=33)),
+     "params": {
+         "estimator__n_estimators":
+             20 + hp.randint("rfm_rf__estimator__n_estimators", 70)}},
+
+    # model based selection with a logistic regression as the estimator
+    {"name": "rfm_lr",
+     "object":
+         SelectFromModel(estimator=LogisticRegression(n_jobs=-1,
+                                                      random_state=33)),
+     "params": {
+          "estimator__C": hp.uniform("rfm_lr__estimator__C", 0.1, 1000)}},
+
+    # standard scaling followed by a PCA; the step itself is a Pipeline,
+    # therefore the parameter name carries the "pca__" prefix
+    {"name": "std_scaler_pca",
+     "object": Pipeline([
+             ("scaler", StandardScaler()),
+             ("pca", PCA(random_state=33))]),
+     "params": {
+        "pca__n_components": hp.uniform("pca__n_components", 0.1, 1),
+       }}
+    ]
+
+# Candidate final estimators. Every entry is a dict with the keys
+#   "name":   name of the step inside the composed pipeline,
+#   "object": the unfitted estimator,
+#   "params": parameter name -> hyperopt expression to sample it from.
+models = [
+        {"name": "xgb",
+         "object": XGBRFClassifier(n_jobs=-1, eval_metric="map", seed=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('xgb__n_estimators', 100),
+           "max_depth": 3 + hp.randint("xgb__max_depth", 10),
+           # hp.loguniform returns exp(uniform(low, high)), i.e. it expects
+           # the bounds on the log scale. The bounds are passed through
+           # np.log so that the learning rate is sampled from [0.01, 0.5]
+           # (raw bounds would sample from about [1.01, 1.65]).
+           "learning_rate": hp.loguniform("xgb__learning_rate",
+                                          np.log(0.01), np.log(0.5))
+           }},
+
+        {"name": "rf",
+         "object": RandomForestClassifier(n_jobs=-1, random_state=33),
+         "params": {
+           "n_estimators": 50 + hp.randint('rf__n_estimators', 500),
+           "max_depth": 3 + hp.randint("rf__max_depth", 10),
+           "min_samples_leaf": 1 + hp.randint("rf__min_samples_leaf", 10)
+           }},
+
+        # the default solver does not accept l1 penalty
+        {"name": "lr",
+         "object": LogisticRegression(random_state=33,
+                                      solver='liblinear',
+                                      # n_jobs=-1
+                                      ),
+         "params":  {
+           "penalty": hp.choice("lr__penalty", ["l1", "l2"]),
+           "C": hp.uniform("lr__C", 0.1, 1000)}},
+
+        # svc does not support parallelization, therefore is slow
+        {"name": "svc",
+         "object": SVC(random_state=33),
+         "params": {
+            "kernel": hp.choice("svc__kernel", ["linear", "poly", "rbf"]),
+            "degree": 2 + hp.randint("svc__degree", 3),
+            "C": hp.uniform("svc__C", 0.1, 1000)
+            }}
+        ]
+
+# Order of the steps inside every composed pipeline.
+step_list = [encoders, transformers, selectors, models]
+
+# hyperopt space over all combinations of one candidate per non-empty step
+space = SpaceComposer().compose_hyperopt_space(step_list)

+ 40 - 0
cdplib/hyperopt/space_sample.py

@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct  5 09:50:24 2020
+
+@author: tanya
+"""
+
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectPercentile
+from sklearn.linear_model import LogisticRegression
+from sklearn.decomposition import PCA
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from hyperopt import hp
+import numpy as np
+
+
+# Sample hyperopt space: a choice between two complete pipelines.
+# Every option is a dict with the keys
+#   "name":     name of the pipeline,
+#   "pipeline": the unfitted sklearn Pipeline,
+#   "params":   "<step name>__<parameter>" -> hyperopt expression.
+space = hp.choice("pipelines", [
+
+        # scaling -> univariate feature selection -> random forest
+        {"name": "std_scaler_kbest_rf",
+         "pipeline": Pipeline([
+                 ("std_scaler", StandardScaler()),
+                 ("kbest", SelectPercentile()),
+                 ("rf", RandomForestClassifier())]),
+         # range(1, 3) offers the percentiles 1 and 2;
+         # 50 + hp.randint(label, 50) covers the integers 50, ..., 99
+         "params": {"kbest__percentile":
+                    hp.choice('kbest__percentile', range(1, 3)),
+                    "rf__n_estimators":
+                    50 + hp.randint('rf__n_estimators', 50)}},
+
+        # scaling -> PCA -> logistic regression
+        {"name": "std_scaler_pca_lr",
+         "pipeline": Pipeline([
+                 ("std_scaler", StandardScaler()),
+                 ("pca", PCA()),
+                 ("lr", LogisticRegression())]),
+         # hp.loguniform takes its bounds on the log scale,
+         # so C is sampled between 0.01 and 0.1;
+         # 1 + hp.randint(label, 4) covers the integers 1, ..., 4
+         "params": {"lr__C":
+                    hp.loguniform("lr__C", np.log(0.01), np.log(0.1)),
+                    "pca__n_components":
+                    1 + hp.randint("pca__n_components", 4)}}
+        ])

+ 85 - 0
cdplib/hyperparameter_space_composer/SpaceComposer.py

@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 13:54:04 2020
+
+@author: tanya
+@description: a class that from a given list of pipeline steps
+ composes a space to be passed in the GridsearchPipelineSelector
+ or HyperoptPipelineSelector classes.
+ A classic list of steps would be: [encoders, transformers, selectors, models]
+"""
+from sklearn.pipeline import Pipeline
+from hyperopt import hp
+from itertools import product
+
+
+class SpaceComposer:
+    """
+    Builds pipeline spaces out of lists of candidate pipeline steps.
+    The result can be passed to GridsearchPipelineSelector
+    or HyperoptPipelineSelector.
+    """
+    def compose_gridsearch_space(self, step_list: list) -> list:
+        """
+        Builds one space element per combination of candidates,
+        taking exactly one candidate from every non-empty step.
+
+        :param step_list: list of steps, classically
+        [encoders, transformers, selectors, models].
+        Every step is a list of candidates, every candidate is a dict,
+        for example {"name": "kbest",
+                     "object": SelectPercentile(),
+                     "params": {
+                             "percentile":
+                                 [5, 10, 20],
+                             "score_func":
+                                 [f_classif, chi2, mutual_info_classif]}}
+        Steps without candidates are ignored.
+
+        :return: a list of dictionaries of form
+            {"name": NAME, "pipeline": PIPELINE, "params": PARAMS},
+            where NAME joins the candidate names with "_" and the keys
+            of PARAMS are prefixed with "<candidate name>__"
+        """
+        non_empty_steps = [candidates for candidates in step_list
+                           if len(candidates) > 0]
+
+        composed = []
+
+        for chosen in product(*non_empty_steps):
+
+            element = {}
+
+            element["name"] = "_".join(
+                    [candidate["name"] for candidate in chosen])
+
+            element["pipeline"] = Pipeline(
+                    [(candidate["name"], candidate["object"])
+                     for candidate in chosen])
+
+            # prefix every parameter with the name of its step,
+            # as required for setting parameters of a Pipeline
+            prefixed_params = {}
+
+            for candidate in chosen:
+                for param_name, param_dist in candidate["params"].items():
+                    key = candidate["name"] + "__" + param_name
+                    prefixed_params[key] = param_dist
+
+            element["params"] = prefixed_params
+
+            composed.append(element)
+
+        return composed
+
+    def compose_hyperopt_space(self, step_list: list) -> hp.choice:
+        """
+        Wraps the composed space elements into a hyperopt choice
+        labeled "pipelines".
+
+        :param step_list: list of steps, classically
+        [encoders, transformers, selectors, models].
+        Every step is a list of candidates, every candidate is a dict,
+        for example {"name": "kbest",
+                     "object": SelectPercentile(),
+                     "params": {
+                             "percentile":
+                                 3 + hp.randint("kbest__percentile", 200),
+                             "score_func":
+                                 hp.choice("kbest__score_func",
+                                    [f_classif, chi2, mutual_info_classif])}}
+
+        :return: hp.choice over the elements returned by
+            compose_gridsearch_space
+        """
+        options = self.compose_gridsearch_space(step_list)
+
+        return hp.choice("pipelines", options)

+ 1 - 0
cdplib/hyperparameter_space_composer/__init__.py

@@ -0,0 +1 @@
+from .SpaceComposer import *

+ 13 - 1
cdplib/log.py

@@ -7,6 +7,7 @@ import sys
 import os
 import logging
 from datetime import datetime
+import warnings
 
 sys.path.append(os.getcwd())
 
@@ -121,9 +122,20 @@ class Log():
 
         raise Exception(message)
 
+    def log_and_throw_warning(self, message):
+        '''
+        Passes the message to the warning method of the wrapped logger
+        and issues it as a python warning via warnings.warn.
+        No exception is raised.
+
+        :param message: text of the warning
+        '''
+        self._logger.warning(message)
+
+        warnings.warn(message)
+        
     def log_and_raise_warning(self, message):
         '''
         '''
+        # Deprecated: kept for backward compatibility.
+        # Announces the deprecation first, then logs the message and
+        # issues it as a python warning without raising an exception.
+        # stacklevel=2 attributes the deprecation warning to the caller.
+        warnings.warn(("This method has been deprecated. "
+                       "Use log_and_throw_warning instead"),
+                      DeprecationWarning,
+                      stacklevel=2)
+
         self._logger.warning(message)
 
-        raise Warning(message)
+        warnings.warn(message)

+ 272 - 0
cdplib/ml_validation/CVComposer.py

@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 10:27:39 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List, NewType
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile, chain
+
+from cdplib.log import Log
+
+
+# Type of a cv object: an iterable of (train indices, test indices).
+# NOTE(review): Tuple[List] literally denotes a 1-tuple,
+# Tuple[List, List] was probably intended - confirm.
+CVType = NewType("CVType", Iterable[Tuple[List]])
+
+# Type of the containers accepted as data sets (and as test_index)
+# by the methods of CVComposer.
+DataSetType = NewType("DataSetType",
+                      Union[pd.DataFrame, pd.Series, np.ndarray, List])
+
+
+class CVComposer:
+    """
+    Groups methods for composing cv objects
+    that follow standards from sklearn,
+    these cv objects can be passed to algorithms like gridsearch, etc
+    """
+    def __init__(self):
+        """
+        Creates the logger shared by the methods of the class.
+        """
+        self._logger = Log("CVComposer: ")
+
+    @staticmethod
+    def _fold_offsets(first: int, last: int,
+                      step_proportion: Union[float, None],
+                      data_set_size: int) -> range:
+        """
+        Returns the offsets first, first + step, ... not exceeding last,
+        where step = int(step_proportion * data_set_size).
+
+        :raises ValueError: if step_proportion is not set, or if there is
+            at least one offset to produce but the step is smaller
+            than one row (such a step would never reach last)
+        """
+        if step_proportion is None:
+            raise ValueError("Set step_proportion")
+
+        if first > last:
+            # no fold fits into the data set
+            return range(0)
+
+        step_size = int(step_proportion * data_set_size)
+
+        if step_size < 1:
+            raise ValueError(("step_proportion is too small: "
+                              "the step between folds is less than 1 row"))
+
+        return range(first, last + 1, step_size)
+
+    def dummy_cv(
+            self,
+            train_set_size: Union[int, None] = None,
+            train_index: Union[pd.Series, np.ndarray, None] = None,
+            test_set_size: Union[int, None] = None,
+            test_index: DataSetType = None) -> CVType:
+        """
+        Composes a cv object with a single (train, test) fold.
+
+        :param train_set_size: number of train rows, the train indices
+            are then 0, ..., train_set_size - 1
+        :param train_index: explicit train indices,
+            mutually exclusive with train_set_size
+        :param test_set_size: number of test rows, the test indices
+            are then the positions directly after the train rows
+        :param test_index: explicit test indices,
+            mutually exclusive with test_set_size
+
+        :return: [(train_index, test_index)]
+        """
+        assert((train_index is None) != (train_set_size is None)),\
+            "Set train_index or train_set_size"
+
+        assert((test_index is None) != (test_set_size is None)),\
+            "Set test_index or test_set_size"
+
+        if train_index is None:
+            train_index = list(range(train_set_size))
+
+        if test_index is None:
+            # the offset is taken from the length of the train index,
+            # so it also works if train_index was passed explicitly
+            test_start = len(train_index)
+
+            test_index = list(range(test_start, test_start + test_set_size))
+
+        return [(train_index, test_index)]
+
+    def dummy_cv_and_concatenated_data_set(
+            self,
+            X_train: DataSetType,
+            X_test: DataSetType,
+            y_train: Union[DataSetType, None] = None,
+            y_test: Union[DataSetType, None] = None)\
+            -> Tuple[CVType, DataSetType, Union[DataSetType, None]]:
+        """
+        Concatenates a train and a test set and composes a single-fold
+        cv object that splits the concatenated set back into them.
+
+        The pandas index is kept (and used in the cv object) only if
+        both X_train and X_test are dataframes with disjoint indices
+        and the targets are either absent or series indexed like X.
+        Otherwise the result is converted to numpy and
+        the cv object contains positions.
+
+        :return: (cv, X, y), where y is None if no targets were passed
+        """
+        assert((y_test is None) == (y_train is None))
+
+        train_is_frame = isinstance(X_train, pd.DataFrame)
+        test_is_frame = isinstance(X_test, pd.DataFrame)
+
+        use_index = (train_is_frame and test_is_frame and
+                     set(X_train.index).isdisjoint(X_test.index))
+
+        if use_index and (y_train is not None):
+            # labels can only be used if the targets carry the same ones
+            use_index = (isinstance(y_train, pd.Series) and
+                         isinstance(y_test, pd.Series) and
+                         X_train.index.equals(y_train.index) and
+                         X_test.index.equals(y_test.index))
+
+        if (train_is_frame or test_is_frame) and (not use_index):
+            self._logger.log_and_throw_warning(
+                    "The concatenated dataframe is converted to numpy")
+
+        if use_index:
+
+            cv = self.dummy_cv(train_index=X_train.index,
+                               test_index=X_test.index)
+
+            X = pd.concat([X_train, X_test], ignore_index=False, axis=0)
+
+        else:
+            cv = self.dummy_cv(train_set_size=len(X_train),
+                               test_set_size=len(X_test))
+
+            X = np.concatenate([X_train, X_test])
+
+        if y_train is None:
+            y = None
+
+        elif use_index:
+            y = pd.concat([y_train, y_test], ignore_index=False, axis=0)
+
+        else:
+            y = np.concatenate([y_train, y_test])
+
+        return cv, X, y
+
+    def expanding_cv(self, test_proportion: float,
+                     start_train_proportion: float,
+                     step_proportion: float = None,
+                     expanding_test_size: bool = False,
+                     data_set_size: Union[int, None] = None,
+                     index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> Iterable[Tuple[pd.Series, pd.Series]]:
+        """
+        Generates folds with a growing train set followed by a test set.
+
+        All proportions are relative to the size of the data set.
+        The first train set has int(start_train_proportion * size) rows,
+        every next one is int(step_proportion * size) rows longer.
+        Folds are produced while a test set of
+        int(test_proportion * size) rows fits after the train set.
+
+        :param expanding_test_size: if True, the test set of a fold has
+            int(test_proportion * <train size of the fold>) rows
+        :param data_set_size: number of rows, the indices are then
+            0, ..., data_set_size - 1
+        :param index: explicit indices, mutually exclusive
+            with data_set_size
+
+        :return: generator of (train indices, test indices),
+            both given as pandas series
+        """
+        try:
+            assert((index is None) != (data_set_size is None)),\
+                "Set index or data_set_size"
+
+            index = pd.Series(index) if (index is not None)\
+                else pd.Series(range(data_set_size))
+
+            data_set_size = len(index)
+
+            start_train_size = int(start_train_proportion * data_set_size)
+            test_size = int(test_proportion * data_set_size)
+
+            train_sizes = self._fold_offsets(
+                    first=start_train_size,
+                    last=data_set_size - test_size,
+                    step_proportion=step_proportion,
+                    data_set_size=data_set_size)
+
+            for train_size in train_sizes:
+
+                if train_size < 1:
+                    raise ValueError(("start_train_proportion is too small: "
+                                      "the train set is empty"))
+
+                fold_test_size = int(test_proportion * train_size)\
+                    if expanding_test_size else test_size
+
+                # positional slicing, independent of the labels of index
+                yield (index.iloc[:train_size],
+                       index.iloc[train_size: train_size + fold_test_size])
+
+        except Exception as e:
+            self._logger.log_and_raise_error(("Failed to make expanding cv. "
+                                              "Exit with error: {}".format(e)))
+
+    def sliding_window_cv(
+        self,
+        test_proportion: float,
+        train_proportion: float,
+        step_proportion: float = None,
+        data_set_size: Union[int, None] = None,
+        index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> Union[Iterable[Tuple[pd.Series, pd.Series]], None]:
+        """
+        Composes folds with a train window of fixed size that slides
+        over the data set, each followed by a test set.
+
+        All proportions are relative to the size of the data set.
+        The train window starts at row 0 and is moved by
+        int(step_proportion * size) rows while the window together
+        with the test set fits into the data set.
+
+        :param data_set_size: number of rows, the indices are then
+            0, ..., data_set_size - 1
+        :param index: explicit indices, mutually exclusive
+            with data_set_size
+
+        :return: generator of (train indices, test indices),
+            both given as pandas series
+        """
+        try:
+            assert((index is None) != (data_set_size is None)),\
+                "Set index or data_set_size"
+
+            index = pd.Series(index) if (index is not None)\
+                else pd.Series(range(data_set_size))
+
+            data_set_size = len(index)
+
+            train_size = int(train_proportion * data_set_size)
+            test_size = int(test_proportion * data_set_size)
+
+            train_starts = self._fold_offsets(
+                    first=0,
+                    last=data_set_size - train_size - test_size,
+                    step_proportion=step_proportion,
+                    data_set_size=data_set_size)
+
+            if (len(train_starts) > 0) and (train_size < 1):
+                raise ValueError(("train_proportion is too small: "
+                                  "the train set is empty"))
+
+            # positional slicing, independent of the labels of index
+            cv = ((index.iloc[train_start: train_start + train_size],
+                   index.iloc[train_start + train_size:
+                              train_start + train_size + test_size])
+                  for train_start in train_starts)
+
+            return cv
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                    ("Failed to make sliding window cv. "
+                     "Exit with error: {}".format(e)))
+
+    def nested_expanding_cv(self,
+            test_proportion: float,
+            start_train_proportion: float,
+            step_proportion: float = None,
+            expanding_test_size: bool = False,
+            data_set_size: Union[int, None] = None,
+            index: Union[pd.Series, np.ndarray, list, None] = None)\
+            -> List[List[Tuple[pd.Series, pd.Series]]]:
+        """
+        Composes an expanding cv inside the train set
+        of every fold of an expanding cv.
+
+        The parameters have the same meaning as in expanding_cv and
+        are used for the outer as well as for the inner folds.
+
+        :return: list with one entry per outer fold, the entry is
+            the list of (train indices, test indices) of the expanding
+            cv computed on the train set of that outer fold
+        """
+        try:
+            outer_cv = self.expanding_cv(
+                    test_proportion=test_proportion,
+                    start_train_proportion=start_train_proportion,
+                    step_proportion=step_proportion,
+                    expanding_test_size=expanding_test_size,
+                    data_set_size=data_set_size,
+                    index=index)
+
+            nested_cv = []
+
+            # only the train part of an outer fold is split further
+            for train_inds, _ in outer_cv:
+
+                if index is not None:
+                    fold_index, fold_size = train_inds, None
+                else:
+                    fold_index, fold_size = None, len(train_inds)
+
+                fold_cv = self.expanding_cv(
+                        test_proportion=test_proportion,
+                        start_train_proportion=start_train_proportion,
+                        step_proportion=step_proportion,
+                        expanding_test_size=expanding_test_size,
+                        data_set_size=fold_size,
+                        index=fold_index)
+
+                nested_cv.append(list(fold_cv))
+
+            return nested_cv
+
+        except Exception as e:
+            self._logger.log_and_raise_error(
+                    ("Failed to make nested expanding cv. "
+                     "Exit with error: {}".format(e)))
+
+    def cv_slice_dataset(self, X, y, train_inds, test_inds)\
+            -> Tuple[Union[pd.DataFrame, np.ndarray],
+                     Union[pd.DataFrame, np.ndarray],
+                     Union[pd.Series, np.ndarray, None],
+                     Union[pd.Series, np.ndarray, None]]:
+        """
+        Splits a data set into the train and the validation part
+        of one cv fold.
+
+        :param X: features, a dataframe is sliced with .loc,
+            any other container with []
+        :param y: targets sliced with [], can be None
+        :param train_inds: train indices of the fold
+        :param test_inds: test indices of the fold
+
+        :return: (X_train, X_val, y_train, y_val),
+            the last two are None if y is None
+        """
+        if isinstance(X, pd.DataFrame):
+            X_train = X.loc[train_inds]
+            X_val = X.loc[test_inds]
+        else:
+            X_train = X[train_inds]
+            X_val = X[test_inds]
+
+        if y is None:
+            y_train = None
+            y_val = None
+        else:
+            y_train = y[train_inds]
+            y_val = y[test_inds]
+
+        return X_train, X_val, y_train, y_val
+
+

+ 2 - 0
cdplib/ml_validation/__init__.py

@@ -0,0 +1,2 @@
+from .cross_validate_with_fine_tuning import *
+from .CVComposer import *

+ 387 - 0
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -0,0 +1,387 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 29 13:58:23 2020
+
+@author: tanya
+
+
+@description:
+
+* Input:
+    - pipeline/hyperparameter space
+    - data_train
+    - cv
+    - cv_folds
+
+* For each pipeline:
+
+    -> Split data_train into folds according to cv
+
+     -> For each fold:
+
+         => get data_train_fold, data_test_fold, cv_fold
+
+         => split data_train_fold into subfolds according to cv_fold
+
+         => For each subfold:
+
+             ==> get data_train_subfold, data_test_subfold
+
+             ==> train pipeline on data_train_subfold
+
+             ==> find best_threshold_subfold on data_test_subfold
+
+        => Find averaged_threshold_fold averaged over best_threshold_subfold
+
+        => train pipeline on data_train_fold
+
+        => find score_fold on data_test_fold with proba_threshold_fold
+
+        => find best_threshold_fold on data_test_fold
+
+    -> find score averaged over score_fold
+
+    -> find averaged_threshold averaged over best_threshold_fold
+
+* choose (pipeline/hyperparameters, threshold) in the space with best score
+
+"""
+
+import sys
+
+import pandas as pd
+import numpy as np
+from itertools import zip_longest
+
+if sys.version_info >= (3, 8):
+    from typing import Callable, Dict, Iterable, Union
+else:
+    from typing_extensions import Callable, Dict, Iterable, Union
+
+from copy import deepcopy
+
+from sklearn.model_selection import StratifiedKFold
+
+from cdplib.log import Log
+
+from cdplib.ml_validation.CVComposer import CVComposer
+
+
+# TODO: write with yield !!!!
+
+def get_optimal_proba_threshold(score_func: Callable,
+                                y_true: Union[pd.Series, np.ndarray],
+                                proba: Union[pd.Series, np.ndarray],
+                                threshold_set: Union[Iterable, None] = None):
+    """
+    Finds the probability threshold with the highest score.
+
+    :param score_func: called as score_func(y_true, y_pred),
+        the higher the returned value the better
+    :param y_true: true labels
+    :param proba: predicted probabilities, a sample is predicted as 1
+        if its probability is greater or equal to the threshold
+    :param threshold_set: thresholds to try,
+        by default 0, 0.1, ..., 0.9
+
+    :return: the threshold with the maximal score,
+        the first one of them in case of ties
+    """
+    candidates = np.arange(0, 1, 0.1) if threshold_set is None\
+        else threshold_set
+
+    score_by_threshold = {
+        candidate: score_func(y_true, (proba >= candidate).astype(int))
+        for candidate in candidates}
+
+    return max(score_by_threshold, key=score_by_threshold.get)
+
+
+def cross_validate_with_optimal_threshold(
+        score_func_threshold: Callable,
+        estimator: object,
+        X: Union[pd.DataFrame, np.ndarray],
+        y: Union[pd.Series, np.ndarray, None] = None,
+        scoring: Union[Callable, Dict] = None,
+        cv: Union[Iterable, int, None] = None,
+        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
+        y_val: Union[pd.Series, np.ndarray, None] = None,
+        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
+        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
+        cv_threshold: Union[Iterable, int, None] = None,
+        threshold_set: Union[Iterable, None] = None,
+        scores: Dict = None)-> Dict:
+    """
+    Scores an estimator whose probability threshold is tuned
+    on separate data.
+
+    If cv is None, the estimator is scored on (X_val, y_val).
+    The threshold is found either
+    - on (X_val_threshold, y_val_threshold), the estimator is then
+      refit on the train set concatenated with that set, or
+    - on every fold of cv_threshold (an int means the number of
+      stratified folds), the thresholds are averaged and
+      the estimator is refit on the train set, or
+    - if none of both is given, on (X_val, y_val) itself
+      (this might lead to overfitting), without a refit.
+
+    If cv is given, the function calls itself on every fold of cv,
+    cv_threshold is then None, an int applied to every fold,
+    or an iterable with one entry per fold (missing entries are None).
+    X_val, y_val, X_val_threshold, y_val_threshold are not used then.
+
+    :param score_func_threshold: called as f(y_true, y_pred),
+        maximized to find the threshold
+    :param estimator: object with fit and predict_proba
+    :param scoring: additional metrics as a dict name -> f(y_true, y_pred)
+    :param threshold_set: thresholds to try,
+        passed to get_optimal_proba_threshold
+    :param scores: results accumulated so far, extended in place
+
+    :return: dict of lists with the keys "test_threshold",
+        "test_score_threshold", "train_score_threshold" and
+        "test_<name>", "train_<name>" for every metric in scoring.
+        One value is appended per evaluated validation set.
+    """
+    logger = Log("cross_validate_with_optimal_threshold:")
+
+    X_train = deepcopy(X)
+    y_train = deepcopy(y)
+    X_val = deepcopy(X_val)
+    y_val = deepcopy(y_val)
+    X_val_threshold = deepcopy(X_val_threshold)
+    y_val_threshold = deepcopy(y_val_threshold)
+
+    scores = scores or {"test_threshold": [],
+                        "test_score_threshold": [],
+                        "train_score_threshold": []}
+
+    scoring = scoring or {}
+
+    # the thresholds are iterated once per fold,
+    # a one-shot iterator would be exhausted after the first fold
+    if threshold_set is not None:
+        threshold_set = list(threshold_set)
+
+    for metric_name, metric in scoring.items():
+        if "test_" + metric_name not in scores:
+            scores["test_" + metric_name] = []
+            scores["train_" + metric_name] = []
+
+    if cv is None:
+
+        # test score is calculated on X_vals
+
+        assert((X_val is not None) and (y_val is not None)),\
+            "Validation set must be set"
+
+        if cv_threshold is None:
+
+            refit = (X_val_threshold is not None)
+
+            # if a validation set for proba threshold tuning is not given,
+            # we use the validation set on which we calculate the test score
+            # (this might lead to overfitting)
+
+            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
+            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)
+
+            cv_threshold, X_train, y_train =\
+                CVComposer().dummy_cv_and_concatenated_data_set(
+                    X_train=X_train,
+                    X_test=X_val_threshold,
+                    y_train=y_train,
+                    y_test=y_val_threshold)
+        else:
+
+            # if cv_threshold is given, we find the optimal threshold
+            # on each fold and output the average value for the threshold
+
+            if (X_val_threshold is not None):
+                logger.log_and_throw_warning((
+                        "X_val_threshold is set "
+                        "but cv_threshold will be used"))
+
+            if isinstance(cv_threshold, int):
+                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
+                    .split(X=X_train, y=y_train)
+
+            refit = True
+
+        thresholds = []
+
+        for train_inds, val_inds in cv_threshold:
+
+            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
+                CVComposer().cv_slice_dataset(
+                    X=X_train,
+                    y=y_train,
+                    train_inds=train_inds,
+                    test_inds=val_inds)
+
+            estimator.fit(X_train_fold, y_train_fold)
+
+            proba_val = estimator.predict_proba(X_val_fold)[:, 1]
+
+            thresholds.append(get_optimal_proba_threshold(
+                score_func=score_func_threshold,
+                y_true=y_val_fold,
+                proba=proba_val,
+                threshold_set=threshold_set))
+
+        if len(thresholds) == 0:
+            logger.log_and_raise_error("cv_threshold contains no folds")
+
+        # the reported threshold is also the one used for the scores
+        threshold = np.mean(thresholds)
+
+        scores["test_threshold"].append(threshold)
+
+        if refit:
+
+            estimator.fit(X_train, y_train)
+
+            X_fit, y_fit = X_train, y_train
+
+            proba_val = estimator.predict_proba(X_val)[:, 1]
+
+        else:
+            # Without a refit, X_train is the train set concatenated with
+            # X_val, but the estimator has only seen the train part of
+            # the single dummy fold. The train score is calculated
+            # on that part, proba_val already belongs to X_val.
+            X_fit, y_fit = X_train_fold, y_train_fold
+
+        proba_train = estimator.predict_proba(X_fit)[:, 1]
+
+        # labels are cast to int as in get_optimal_proba_threshold
+        pred_train = (proba_train >= threshold).astype(int)
+        pred_val = (proba_val >= threshold).astype(int)
+
+        train_score = score_func_threshold(y_fit, pred_train)
+        test_score = score_func_threshold(y_val, pred_val)
+
+        for metric_name, metric in scoring.items():
+            scores["train_" + metric_name].append(metric(y_fit, pred_train))
+            scores["test_" + metric_name].append(metric(y_val, pred_val))
+
+        scores["train_score_threshold"].append(train_score)
+        scores["test_score_threshold"].append(test_score)
+
+        return scores
+
+    else:
+
+        if isinstance(cv, int):
+            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)
+
+        folds = list(cv)
+
+        # one cv_threshold per fold, None if nothing is given for a fold
+        if cv_threshold is None:
+            fold_cvs = [None] * len(folds)
+
+        elif isinstance(cv_threshold, int):
+            fold_cvs = [cv_threshold or None] * len(folds)
+
+        else:
+            fold_cvs = list(cv_threshold)
+
+            if len(fold_cvs) > len(folds):
+                logger.log_and_raise_error((
+                        "cv_threshold has more entries "
+                        "than cv has folds"))
+
+            fold_cvs = fold_cvs + [None] * (len(folds) - len(fold_cvs))
+
+        for (train_inds, val_inds), cv_fold in zip(folds, fold_cvs):
+
+            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
+                CVComposer().cv_slice_dataset(
+                    X=X_train,
+                    y=y_train,
+                    train_inds=train_inds,
+                    test_inds=val_inds)
+
+            scores = cross_validate_with_optimal_threshold(
+                    estimator=estimator,
+                    score_func_threshold=score_func_threshold,
+                    X=X_train_fold,
+                    y=y_train_fold,
+                    X_val=X_val_fold,
+                    y_val=y_val_fold,
+                    cv_threshold=cv_fold,
+                    scoring=scoring,
+                    threshold_set=threshold_set,
+                    scores=scores)
+
+        return scores
+
+
+# Manual smoke test: runs cross_validate_with_optimal_threshold on the
+# sklearn breast cancer data set in several configurations of
+# cv / cv_threshold / X_val_threshold and prints the resulting scores.
+if __name__ == "__main__":
+
+    from sklearn.metrics import accuracy_score, precision_score
+    from sklearn.datasets import load_breast_cancer
+    from xgboost import XGBRFClassifier
+    from sklearn.model_selection import train_test_split
+
+    data_loader = load_breast_cancer()
+
+    X = data_loader["data"]
+    y = data_loader["target"]
+
+    X_train, X_val, y_train, y_val = train_test_split(X, y)
+
+    estimator = XGBRFClassifier(use_label_encoder=False,
+                                eval_metric="logloss")
+
+    # NOTE(review): score_func is not used below,
+    # accuracy_score is passed directly
+    score_func = accuracy_score
+
+    # additional metric reported next to the threshold score
+    scoring = {"precision": precision_score}
+
+    # one entry per scenario whose result is appended below
+    averaged_scores = []
+    averaged_thresholds = []
+
+    # scenario 1: the threshold is tuned on the validation set itself
+    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")
+
+    scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
+            estimator=estimator,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=None,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=None,
+            y_val_threshold=None,
+            cv_threshold=None)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    # from here on X_train, y_train denote the reduced train set,
+    # the split-off part is used for tuning the threshold
+    X_train, X_val_threshold, y_train, y_val_threshold =\
+        train_test_split(X_train, y_train)
+
+    # scenario 2: the threshold is tuned on a separate validation set
+    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")
+
+    scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
+            estimator=estimator,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=None,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv_threshold=None)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    # scenario 3: the threshold is tuned with a 3-fold cv on the train set
+    print("\nTesting cv=None, cv_threshold=3 \n")
+
+    scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
+            estimator=estimator,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=None,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv_threshold=3)
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    # scenario 4: 3-fold cv for the score, no separate threshold tuning
+    # NOTE(review): the result of this scenario is only printed,
+    # it is not appended to averaged_scores / averaged_thresholds
+    print("\nTesting cv=3, cv_threshold=None \n")
+
+    scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
+            estimator=estimator,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=3,
+            X_val=None,
+            y_val=None,
+            X_val_threshold=None,
+            y_val_threshold=None,
+            cv_threshold=None)
+
+    print("\nScores:", scores)
+
+    print("\n ########################################################## \n")
+
+    # scenario 5: 3-fold cv for the score,
+    # a 3-fold cv for the threshold inside of every fold
+    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")
+
+    scores = cross_validate_with_optimal_threshold(
+            score_func_threshold=accuracy_score,
+            estimator=estimator,
+            X=X_train,
+            y=y_train,
+            scoring=scoring,
+            cv=3,
+            X_val=X_val,
+            y_val=y_val,
+            X_val_threshold=X_val_threshold,
+            y_val_threshold=y_val_threshold,
+            cv_threshold=[3, 3, 3])
+
+    print("\nScores:", scores)
+
+    averaged_scores.append(np.mean(scores["test_score_threshold"]))
+    averaged_thresholds.append(np.mean(scores["test_threshold"]))
+
+    print("\n ########################################################## \n")
+
+    # TODO: check overwriting X_train,
+    # additional metrics append instead of overwrite
+    # check the length of cv_threshold
+    # test custom cv, cv_threshold
+
+    # one value per appended scenario (1, 2, 3 and 5)
+    print("\n Averaged test score:", averaged_scores)
+    print("\n Averaged threshold:", averaged_thresholds)

+ 824 - 0
cdplib/pipeline_selector/PipelineSelector.py

@@ -0,0 +1,824 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 30 14:23:23 2020
+
+@author: tanya
+@description: an abstract class for selecting a machine learning
+ pipeline from a space (deterministic or random) of parameter distributions
+ over multiple pipelines.
+ The selection is thought in such a way that a Trials object is being
+ maintained during the tuning process from which one can retrieve
+ the best pipeline so far
+ as well as the entire tuning history if needed.
+ Methods configure_cross_validation and configure_result_saving
+ allow to use a custom cross-validation method and
+ save the current best result in a file or database during training.
+ Children classes: hyperopt and custom gridsearch.
+"""
+
+import pickle
+import os
+import sys
+import time
+import datetime
+import numpy as np
+import pandas as pd
+from copy import deepcopy
+from abc import ABC, abstractmethod, abstractproperty
+
+if sys.version_info >= (3, 8):
+    from typing import Callable, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
+else:
+    from typing_extensions import Callable, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
+
+import functools
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import cross_validate as sklearn_cross_validation
+from sklearn.metrics import make_scorer
+from hyperopt import STATUS_OK, STATUS_FAIL
+from cdplib.log import Log
+from cdplib.utils import ExceptionsHandler
+from cdplib.utils import LoadingUtils
+from cdplib.ml_validation import CVComposer
+
+sys.path.append(os.getcwd())
+
+
+class SpaceElementType(TypedDict):
+    """
+    Shape of one element of the search space handled by PipelineSelector.
+    """
+    # label of the pipeline, reported in the log messages of _objective
+    name: str
+    # pipeline object that is configured and evaluated for this element
+    pipeline: Pipeline
+    # keyword arguments handed to pipeline.set_params in _objective
+    params: dict
+    
+# TODO Tanya: add possibility to include confusion matrix in
+# additional metrics
+# check that cv object contains indices
+
+class PipelineSelector(ABC):
+    """
+    An abstract class for selecting a machine learning
+    pipeline from a space (deterministic or random) of parameter
+    distributions over multiple pipelines.
+    The selection is designed in such a way that a Trials object is being
+    maintained during the tuning process from which one can retrieve
+    the best pipeline so far as well as the entire tuning history
+    if needed.
+    Methods configure_cross_validation and configure_result_saving
+    allow to use a custom cross-validation method and
+    save the current best result in a file or database during training.
+    Children classes: hyperopt and custom gridsearch.
+    """
+    def __init__(self,
+                 cost_func: Union[Callable, str],
+                 greater_is_better: bool,
+                 trials_path: str,
+                 backup_trials_freq: int = None,
+                 cross_validation_needs_scorer: bool = True,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Dict[str, Callable] = None,
+                 additional_averaging_funcs: Dict[str, Callable] = None,
+                 strategy_name: str = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
+        """
+        :param cost_func: function (or a string naming it) to minimize
+            or maximize over the elements of a given
+            (pipeline/hyperparameter) space
+
+        :param bool greater_is_better: when True
+            cost_func is maximized, else minimized.
+
+        :param str trials_path: path at which the trials object is saved
+            in binary format. From the trials object we can
+            select information about the obtained scores, score variations,
+            and pipelines, and parameters tried out so far. If a trials object
+            already exists at the given path, it is loaded and the
+            search is continued, else, the search is started from scratch.
+
+        :param int backup_trials_freq: frequency in iterations (trials)
+            of saving the trials object at the trials_path.
+            If None, the trials object is backed up every time
+            the score improves.
+
+        :param bool cross_validation_needs_scorer: when True, the metrics
+            are wrapped with sklearn.metrics.make_scorer in _evaluate
+            before they are passed to the cross-validation function.
+
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores of the cost_func.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metrics: dict of additional metrics to keep track
+            of in the trials of the form {"metric_name": metric}.
+            None is treated as an empty dict.
+
+        :param additional_averaging_funcs: functions used to aggregate
+            the output of the cross_validate function.
+            The output always contains the scores of the cost_func,
+            additional_metrics (if it is not empty),
+            but it can also contain additional information
+            (like probability threshold for example)
+            if different from cross_val_averaging_func.
+            Of the form {"metric_name": averaging_func}
+
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
+
+        :param str stdout_log_level: can be INFO, WARNING, ERROR
+        """
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
+
+        try:
+
+            ExceptionsHandler(self._logger)\
+                .assert_is_directory(path=trials_path)
+
+            self.attached_space = False
+            self.attached_data = False
+            self.configured_cross_validation = False
+            self.configured_summary_saving = False
+
+            self._cost_func = cost_func
+            self._greater_is_better = greater_is_better
+            # score factor is 1 when cost_func is minimized,
+            # -1 when cost func is maximized
+            self._score_factor = (not greater_is_better) - greater_is_better
+            self._cross_val_averaging_func = cross_val_averaging_func
+            # normalised to a dict so that _evaluate can always merge it
+            # with the cost function
+            self._additional_metrics = additional_metrics or {}
+            self._additional_averaging_funcs = additional_averaging_funcs or {}
+
+            self.trials_path = trials_path
+            self._backup_trials_freq = backup_trials_freq
+
+            self._strategy_name = strategy_name
+            self._data_path = None
+            self._cv_path = None
+            self._X = None
+            self._y = None
+            self._cv = None
+            self._space = None
+
+            # if cross-validation is not configured,
+            # sklearn cross-validation method is taken by default
+            self._cross_validation = sklearn_cross_validation
+
+            self._cross_validation_needs_scorer = cross_validation_needs_scorer
+
+            # if a trials object already exists at the given path,
+            # it is loaded and the search is continued. Else,
+            # the search is started from the beginning.
+            if os.path.isfile(self.trials_path):
+
+                # NOTE: unpickling can execute arbitrary code, trials_path
+                # must only point to files coming from a trusted source
+                with open(self.trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
+
+                # an empty trials object is handled like a missing one
+                if len(self._trials) == 0:
+                    self._trials = None
+
+            else:
+                self._trials = None
+
+            if self._trials is not None:
+
+                self._start_iteration = self.number_of_trials
+
+                self.best_score = self.best_trial_score
+
+                self._logger.info(("Loaded an existing trials object "
+                                   "consisting of {} trials")
+                                  .format(self._start_iteration))
+
+            else:
+                self._logger.warning(("No existing trials object was found, "
+                                      "Starting from scratch."))
+
+                self._start_iteration = 0
+                # NaN marks "no score yet", see the check in _objective
+                self.best_score = np.nan
+
+            # keeping track of the current search iteration
+            self._iteration = self._start_iteration
+            self._score_improved = False
+
+            self.start_tuning_time = datetime.datetime.today()
+            self.total_tuning_time = None
+            self.finished_tuning = False
+
+        except Exception as e:
+            err = ("Failed to initialize the class. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _backup_trials(self) -> None:
+        """
+        Serializes the current trials object with pickle and writes it
+        to the file at self.trials_path.
+        A failure is handed to the logger's log_and_raise_error.
+        """
+        try:
+            with open(self.trials_path, mode="wb") as trials_file:
+                pickle.dump(self._trials, trials_file)
+
+        except Exception as exc:
+            self._logger.log_and_raise_error(
+                "Could not backup trials. Exit with error: {}".format(exc))
+
+    def configure_cross_validation(self,
+                                   cross_validation: Callable,
+                                   kwargs: dict = None) -> None:
+        """
+        Replaces the cross-validation function used by the selector.
+
+        :param cross_validation: a function that has the same
+             signature as sklearn.model_selection.cross_validate
+
+        :param dict kwargs: optional keyword arguments that are bound
+            to cross_validation with functools.partial
+        """
+        try:
+            bound_kwargs = kwargs if kwargs else {}
+
+            self._cross_validation = functools.partial(
+                    cross_validation, **bound_kwargs)
+            self.configured_cross_validation = True
+            self._logger.info("Configured cross validation")
+
+        except Exception as exc:
+            self._logger.log_and_raise_error(
+                ("Failed to configure cross-validation. "
+                 "Exit with error: {}").format(exc))
+
+    def configure_cross_validation_from_module(self,
+                                               module_path: str,
+                                               name: str) -> None:
+        """
+        Attaches a cross-validation function defined in
+        a different python module. This function must have
+        the same signature as sklearn.model_selection.cross_validate
+
+        :param str module_path: path to python module
+            where the cross_validation function is defined.
+
+        :param str name: name of the cross validation function
+            loaded from a python module.
+        """
+        try:
+            loader = LoadingUtils()
+
+            self._cross_validation = loader.load_from_module(
+                module_path=module_path, name=name)
+            self.configured_cross_validation = True
+            self._logger.info("Configured cross validation")
+
+        except Exception as exc:
+            self._logger.log_and_raise_error(
+                ("Failed to load cross-validation from module. "
+                 "Exit with error: {}").format(exc))
+
+    def attach_space(self, space) -> None:
+        """
+        Stores the pipeline/hyperparameter space over which
+        the score_func is optimized and marks the space as attached.
+
+        :param space: space where the search is performed.
+            Either a list of dictionaries or a hyperopt space object,
+            the elements of which are dictionaries with keys:
+            name, pipeline, params
+        """
+        try:
+            self._space, self.attached_space = space, True
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as exc:
+            self._logger.log_and_raise_error(
+                ("Failed to attach space. "
+                 "Exit with error: {}").format(exc))
+
+    def attach_space_from_module(self, module_path: str, name: str) -> None:
+        """
+        Attaches a space defined in a different python module.
+
+        :param str module_path: path to python module
+            where the space is defined.
+
+        :param str name: name of the space loaded from
+            a python module.
+        """
+        try:
+            self._space = LoadingUtils().load_from_module(
+                    module_path=module_path, name=name)
+
+            self.attached_space = True
+
+            self._logger.info("Attached parameter distribution space")
+
+        except Exception as e:
+            err = ("Failed to attach space from module. "
+                   "Exit with error {}".format(e))
+
+            # same logger method as in the other handlers of this class;
+            # the misspelled name used before raised an AttributeError
+            # that hid the original failure
+            self._logger.log_and_raise_error(err)
+
+    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
+                    y_train: Union[pd.DataFrame, pd.Series, np.ndarray]
+                    = None,
+                    X_val: Union[pd.DataFrame, np.ndarray]
+                    = None,
+                    y_val: Union[pd.DataFrame, pd.Series, np.ndarray]
+                    = None,
+                    cv: Union[Iterable[Tuple[List[int], List[int]]]]
+                    = None) -> None:
+        '''
+        Attaches the data (and the cv splits) used to evaluate pipelines.
+        Exactly one of cv and X_val must be given. With X_val, train and
+        validation data are stacked with np.concatenate (train rows first)
+        and a cv with a single train/validation split is built.
+
+        :param array X_train: data on which
+            machine learning pipelines are trained
+
+        :param array y_train: optional, vector with targets,
+            (None in case of unsupervised learning)
+
+        :param array X_val: optional, validation data.
+            When not provided, cross-validated value
+            of the cost_func is calculated.
+
+        :param array y_val: optional, validation targets
+
+        :param list cv: iterable of tuples containing
+            train and validation indices or an integer representing
+            the number of folds for a random split of data
+            during cross-validation
+            example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
+        '''
+        try:
+            # explicit checks instead of assert statements,
+            # which are skipped when python runs with -O
+            if (cv is None) != (X_val is not None):
+                raise ValueError("Either cv or X_val must be provided")
+
+            if cv is None:
+
+                if (y_val is None) != (y_train is None):
+                    raise ValueError(
+                        "y_train and y_val must be simultanious")
+
+                # Here we create a trivial cv object
+                # with one validation split.
+
+                # XXX Tanya finish here
+                # NOTE(review): the value returned by dummy_cv is not used
+                # below; the call is kept because this part is unfinished
+                cv = CVComposer.dummy_cv()
+
+                n_train = len(X_train)
+
+                train_inds = list(range(n_train))
+                # validation rows follow the train rows in the stacked data
+                val_inds = list(range(n_train, n_train + len(X_val)))
+
+                self._cv = [(train_inds, val_inds)]
+
+                self._X = np.concatenate([X_train, X_val])
+                self._y = None if y_train is None\
+                    else np.concatenate([y_train, y_val])
+
+            else:
+
+                self._cv = cv
+                self._X = X_train
+                self._y = y_train
+
+            self.attached_data = True
+
+            self._logger.info("Attached data")
+
+        except Exception as e:
+            err = ("Failed to attach data. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def attach_data_from_hdf5(self,
+                              data_hdf5_store_path: str,
+                              cv_pickle_path: str = None) -> None:
+        """
+        Method for attaching data from a hdf5 store
+         and a cv object from a pickled file.
+
+         The hdf5 store is a binary file,
+         after loading it, it is a dictionary with keys
+         X_train (y_train, X_val, y_val).
+
+         The cv is loaded from a pickle file.
+
+         The reason to separate the data
+         store from the cv store, is the hdf5 is optimized to
+         store large dataframes (especially with simple types) and
+         a small list of lists like a cv-object is better
+         to be stored as a pickle file.
+
+        :param str data_hdf5_store_path: path to the hdf5 store
+            with train and validation data
+        :param str cv_pickle_path: path to the pickle file with
+            the cv data
+        """
+        try:
+            # explicit checks instead of assert statements,
+            # which are skipped when python runs with -O
+            if not os.path.isfile(data_hdf5_store_path):
+                raise FileNotFoundError(
+                    "Parameter hdf5_store_path is not a file")
+
+            # close all opened files, because hdf5 will
+            # fail to reopen an opened (for some reason) file
+            # NOTE(review): _open_files is a private attribute of pytables
+            import tables
+            tables.file._open_files.close_all()
+
+            # the context manager closes the store even when reading fails
+            with pd.HDFStore(data_hdf5_store_path) as store:
+
+                self._data_path = data_hdf5_store_path
+
+                data_input = {key: store[key] if key in store else None
+                              for key in ["X_train", "y_train",
+                                          "X_val", "y_val"]}
+
+            if cv_pickle_path is not None:
+
+                if not os.path.isfile(cv_pickle_path):
+                    raise FileNotFoundError(
+                        "Parameter cv_pickle_path is not a file")
+
+                # NOTE: unpickling can execute arbitrary code, the cv file
+                # must come from a trusted source
+                with open(cv_pickle_path, "rb") as cv_file:
+                    data_input["cv"] = pickle.load(cv_file)
+
+                self._cv_path = cv_pickle_path
+
+            else:
+                data_input["cv"] = None
+
+            self.attach_data(**data_input)
+
+        except Exception as e:
+            err = "Failed to attach data. Exit with error: {}".format(e)
+            self._logger.log_and_raise_error(err)
+
+    @property
+    def default_summary(self) -> dict:
+        """
+        Default summary of the strategy.
+        Every time the _objective function is called,
+        the current score and the information
+        about the tested space element is added to the
+        summary and it is saved to the Trials.
+        If summary saving is configured it is also
+        saved to a file, or a database when the score improves.
+        """
+        if self._strategy_name is None:
+            strategy_part = {}
+        else:
+            strategy_part = {"strategy_name": self._strategy_name}
+
+        # the cost function is reported by name when a name is available
+        if isinstance(self._cost_func, str):
+            cost_func_part = {"cost_func": self._cost_func}
+        elif hasattr(self._cost_func, "__name__"):
+            cost_func_part = {"cost_func": self._cost_func.__name__}
+        else:
+            cost_func_part = {}
+
+        # paths are only known when the data was attached from files
+        paths_part = {key: path
+                      for key, path in (("data_path", self._data_path),
+                                        ("cv_path", self._cv_path))
+                      if path is not None}
+
+        return {**strategy_part,
+                **cost_func_part,
+                "trials_path": self.trials_path,
+                **paths_part,
+                "start_tuning_time": self.start_tuning_time,
+                "iteration": self._iteration}
+
+    def configer_summary_saving(self,
+                                save_method: Callable
+                                = functools.partial(
+                                        pd.DataFrame.to_excel,
+                                        **{"path_or_buf": "result.csv"}),
+                                kwargs: dict = None) -> None:
+        """
+        Registers the method used by _save_summary to persist the summary
+        (default summary updated with the current score and
+        pipeline/hyperparameters) when the score calculated by
+        the _objective function improves.
+
+        :param Callable save_method: method for saving the result
+            of the pipeline selection. The method must accept
+            a pandas DataFrame as argument.
+            By default, saving to an excel file.
+            NOTE(review): the default binds the keyword path_or_buf,
+            while pandas documents the target of DataFrame.to_excel
+            as excel_writer - confirm that the default is usable.
+
+            Examples:
+                functools.partial(pd.DataFrame.to_csv,
+                                  **{"path_or_buf": <PATH>})
+                functools.partial(np.savetxt, **{"fname": <PATH>})
+
+                functools.partial(SQLHandler(<URI>).append_to_table,
+                                  **{"tablename": <NAME>})
+
+                functools.partial(MongodbHandler(<URI>).insert_data_into_collection,
+                                  **{"collection_name": <NAME>})
+
+            using functools can be avoided by providing the kwarg argument
+
+        :param dict kwargs: a dictionary with keyword arguments
+            (like tablename) to provide to the save_method
+        """
+        try:
+            extra_kwargs = kwargs if kwargs else {}
+
+            self._save_method = functools.partial(save_method, **extra_kwargs)
+            self.configured_summary_saving = True
+            self._logger.info("Configured summary saving")
+
+        except Exception as exc:
+            self._logger.log_and_raise_error(
+                ("Failed to configure the summary saving. "
+                 "Exit with error {}").format(exc))
+
+    def _save_summary(self, summary: dict) -> None:
+        """
+        Passes the summary to the save method registered with
+        configer_summary_saving (a file or database writer,
+        depending on the configured save_method).
+        Called by _objective when the score improves.
+
+        :param dict summary: default summary updated with the current
+            score and pipeline/hyperparameters
+        """
+        try:
+            # explicit check instead of an assert statement,
+            # which is skipped when python runs with -O
+            if not self.configured_summary_saving:
+                raise RuntimeError("Result saving must be configured first")
+
+            self._save_method(summary)
+
+        except Exception as e:
+            # the failure happens while saving, not while configuring
+            err = ("Could not save summary. "
+                   "Exit with error: {}".format(e))
+
+            self._logger.log_and_raise_error(err)
+
+    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
+        """
+        Calculates the averaged cross-validated score and score variance,
+        as well as the averaged values and variances of the additional metrics.
+
+        This method is called in the _objective function that is
+        passed to the hyperopt optimizer.
+
+        This function can be overridden, when the cost
+        needs to be calculated differently,
+        for example with a tensorflow model.
+
+        :param Pipeline pipeline: machine learning pipeline
+            that will be evaluated with cross-validation
+
+        :return: dictionary with the aggregated
+            cross-validation scores and
+            the score variances for the scores in the output
+            of the cross-validation function.
+
+            form of the output:
+                {"score": 10, #score used in optimization,
+                 "score_variance": 0.5
+                 "additional_metric1": 5,
+                 "additional_metric1_variance": 7}
+
+            a custom cross-validation function can also include for
+            example probability threshold for each fold, then
+            the output of this function will include the average
+            value and the variance of the probability threshold
+            over the folds.
+        """
+        try:
+            import inspect
+
+            # dict unpacking instead of the "|" operator: it does not need
+            # python 3.9 and tolerates additional_metrics being None
+            scoring = {"score": self._cost_func,
+                       **(self._additional_metrics or {})}
+
+            if self._cross_validation_needs_scorer:
+                # The metrics are wrapped without sign flipping, so that
+                # the reported scores are the raw metric values. The
+                # direction of the optimization is handled by
+                # self._score_factor in _objective; flipping the sign here
+                # as well would invert a minimized cost function twice.
+                # Metrics given by name (str) are passed on unchanged.
+                scoring = {
+                    metric_name:
+                    make_scorer(metric) if callable(metric) else metric
+                    for metric_name, metric in scoring.items()}
+
+            cross_validation_input_args = {
+                 "estimator": pipeline,
+                 "X": self._X,
+                 "y": self._y,
+                 "cv": self._cv,
+                 "scoring": scoring
+                 }
+
+            # error_score is only passed to cross-validation functions
+            # that accept it. The signature is inspected because
+            # functools.partial objects carry no __annotations__.
+            try:
+                accepted_args = inspect.signature(
+                    self._cross_validation).parameters
+            except (TypeError, ValueError):
+                accepted_args = {}
+
+            if "error_score" in accepted_args:
+                cross_validation_input_args["error_score"] = np.nan
+
+            scores = self._cross_validation(**cross_validation_input_args)
+
+            prefix = "test_"
+
+            def strip_prefix(metric_name: str) -> str:
+                # removes only the leading "test_", a "test_" inside
+                # the metric name is left untouched
+                if metric_name.startswith(prefix):
+                    return metric_name[len(prefix):]
+                return metric_name
+
+            test_metric_names = [metric_name for metric_name in scores
+                                 if metric_name.startswith("test")]
+
+            scores_average = {}
+            scores_variance = {}
+
+            for metric_name in test_metric_names:
+                # a metric specific averaging function takes precedence
+                # over the default one
+                averaging_func = self._additional_averaging_funcs.get(
+                    metric_name, self._cross_val_averaging_func)
+
+                short_name = strip_prefix(metric_name)
+
+                scores_average[short_name] = \
+                    averaging_func(scores[metric_name])
+                scores_variance[short_name + "_variance"] = \
+                    np.var(scores[metric_name])
+
+            return {**scores_average, **scores_variance}
+
+        except Exception as e:
+            err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+
+            self._logger.log_and_raise_error(err)
+
+    def _objective(self, space_element: SpaceElementType) -> dict:
+        '''
+        This method is called in run_trials method
+        that is using the hyperopt fmin optimizer.
+
+        Uses _evaluate method.
+
+        It must take as input a space element
+        and produce an output in the form of dictionary
+        with 2 obligatory values loss and status
+        (STATUS_OK or STATUS_FAIL). Other
+        values in the output are optional and can be
+        accessed later through the trials object.
+
+        :Warning: fmin minimizes the loss,
+        when _evaluate returns a value to be maximized,
+        it is multiplied by -1 to obtain loss.
+
+        :param SpaceElementType space_element: element
+            of the space over which the optimization is done
+
+        :output: dictionary with keys
+            loss (minimized value),
+            status with values STATUS_OK or STATUS_FAIL
+            understood by hyperopt,
+            score (equal to loss or -loss),
+            score_variance,
+            timestamp (end of execution),
+            train_time: execution time
+            and other keys given in self.default_summary.
+            When the trial fails, the dictionary contains status,
+            timestamp, error and NaN for loss, score,
+            score_variance and train_time.
+        '''
+        try:
+            start_time = time.time()
+
+            # explicit check instead of an assert statement,
+            # which is skipped when python runs with -O
+            if not self.attached_data:
+                raise RuntimeError("Data must be attached "
+                                   "in order to effectuate the best "
+                                   "pipeline search")
+
+            summary = deepcopy(self.default_summary)
+
+            # backup the current trials if the score improved
+            # at previous iteration or every ith iteration
+            # if the backup_trials_freq is set
+            backup_cond = ((self._backup_trials_freq is not None) and
+                           ((self._iteration - self._start_iteration - 1) %
+                            self._backup_trials_freq == 0)) or\
+                self._score_improved
+
+            if backup_cond:
+                self._backup_trials()
+                self._score_improved = False
+
+            pipeline = space_element['pipeline']
+            params = space_element['params']
+            pipeline.set_params(**params)
+
+            self._logger.info(("Iteration {0}: "
+                               "Current score is {1}: "
+                               "Training pipeline {2} "
+                               "with parameters: {3}. ").format(
+                                  self._iteration,
+                                  self.best_score,
+                                  space_element['name'],
+                                  params))
+
+            result = self._evaluate(pipeline)
+
+            # the evaluation result is merged once, before the
+            # bookkeeping keys below are written
+            summary.update(result)
+
+            end_time = time.time()
+
+            summary['status'] = STATUS_OK
+            summary['loss'] = self._score_factor * summary['score']
+            summary['timestamp'] = datetime.datetime.today()
+            summary['train_time'] = end_time - start_time
+
+            self._iteration += 1
+
+            # best_score != best_score is True only for NaN,
+            # i.e. when no score has been recorded yet
+            self._score_improved = (self.best_score != self.best_score) or\
+                                   (self._score_factor*result["score"] <
+                                    self._score_factor*self.best_score)
+
+            if self._score_improved:
+
+                self._logger.info("Score improved, new best score is: {}"
+                                  .format(result["score"]))
+
+                self.best_score = result['score']
+
+                if self.configured_summary_saving:
+                    self._save_summary(summary)
+
+        except Exception as e:
+
+            # a failed trial does not stop the search, it is reported
+            # to the optimizer with STATUS_FAIL
+            self._logger.warning("Trial failed with error {}".format(e))
+
+            summary = {}
+            summary['status'] = STATUS_FAIL
+            summary['timestamp'] = datetime.datetime.today()
+            summary['error'] = e
+            for key in ['loss', 'score', 'score_variance', 'train_time']:
+                summary[key] = np.nan
+
+        return summary
+
+    @abstractmethod
+    def run_trials(self):
+        """
+        Runs the hyperparameter tuning over possibly multiple
+        pipeline types specified in the attached space.
+        An implementation should set the flag self.finished_tuning
+        to True when it is done, call self._backup_trials and
+        optionally save the summary.
+        """
+
+    @property
+    @abstractmethod
+    def number_of_trials(self) -> int:
+        """
+        Number of trials already run in the current trials object
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial(self) -> dict:
+        """
+        Best trial so far.
+         Should contain the status, pipeline,
+         hyperparameters, and the score (loss).
+         Other information is optional and is defined
+         by self.default_summary
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial_score(self) -> float:
+        """
+        Score of the best pipeline with the best hyperparameters
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial_score_variance(self) -> float:
+        """
+        Variance of the cross-validation score of the best pipeline
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def best_trial_pipeline(self) -> Pipeline:
+        """
+        Best pipeline with best hyperparameters
+        """
+        pass
+
+    @abstractmethod
+    def get_n_best_trial_pipelines(self, n: int) -> list:
+        """
+        Returns the n best pipelines together with
+        their best hyperparameters
+        """
+
+    @abstractmethod
+    def get_n_best_trial_pipelines_of_each_type(self, n_int) -> list:
+        """
+        When the hyperparameter search is done over multiple
+        pipelines, returns n different pipeline-types
+        with the corresponding hyperparameters
+        """
+
+    @abstractmethod
+    def trials_to_excel(self, path: str) -> None:
+        """
+        Writes the trials object in the shape of a table to excel.
+        The table should contain the iteration, pipeline (as str),
+        hyperparameters (as str), self.best_result
+        (see self._objective method) as well as the additional
+        information defined by self.default_summary
+        """

+ 1 - 0
cdplib/pipeline_selector/__init__.py

@@ -0,0 +1 @@
+from .PipelineSelector import *

+ 28 - 18
cdplib/utils/ExceptionsHandler.py

@@ -8,35 +8,45 @@ Created on Fri Sep 27 14:20:58 2019
 
 import os
 import sys
-import logging
 import pandas as pd
+from cdplib.log import Log
+
 sys.path.append(os.getcwd())
 
 
 class ExceptionsHandler:
     '''
     '''
-    def __init__(self):
+    def __init__(self, logger: Log = None):
         '''
         '''
+        self._logger = logger or Log("ExceptionHandler")
 
-    def check_is_file(self, path, logger=None):
+    def check_is_file(self, path: str):
         '''
         '''
-        if logger is None:
-            logger = logging.getLogger()
-
         if not os.path.isfile(path):
             err = "File {} not found".format(path)
-            logger.error(err)
+            self._logger.error(err)
             raise FileNotFoundError(err)
 
-    def _check_column_abscence(self, columns: (str, list), data: pd.DataFrame,
-                               error_or_warning: str, logger = None):
+    def assert_is_directory(self, path: str):
+        '''
+        Makes sure that the directory part of path exists:
+        it is created, together with missing parent directories,
+        when it is not there yet. A path without a directory part
+        (for example a bare file name) is left alone.
+
+        :param str path: path whose directory part is checked
+        '''
+        assert(isinstance(path, str)),\
+            "Parameter 'path' must of str type"
+
+        # directory part of the argument itself, not of a literal string
+        dirname = os.path.dirname(path)
+
+        if len(dirname) > 0:
+            # makedirs accepts exist_ok and creates missing parents,
+            # os.mkdir supports neither
+            os.makedirs(dirname, exist_ok=True)
+
+    def _check_column_abscence(self,
+                               columns: (str, list),
+                               data: pd.DataFrame,
+                               error_or_warning: str):
         '''
+        Report column names that are missing in data.
+
+        NOTE(review): the lines between the two diff hunks are not
+        visible here; presumably they iterate over columns - confirm.
+
+        :param columns: column name or list of column names; a single
+            str is wrapped into a list
+        :param data: data frame whose columns are checked
+        :param error_or_warning: name of the method of self._logger
+            that receives the message for a missing column
+        :raises Exception: when a column is missing and
+            error_or_warning equals "error"
         '''
-        if logger is None:
-            logger = logging.getLogger()
         if isinstance(columns, str):
             columns = [columns]
 
@@ -44,23 +54,23 @@ class ExceptionsHandler:
 
             if column not in data.columns:
                 err = ("{} is not an internal column name".format(column))
-                getattr(logger, error_or_warning)(err)
+                getattr(self._logger, error_or_warning)(err)
 
                 if error_or_warning == "error":
                     raise Exception(err)
 
-    def error_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def error_column_abscence(self,
+                              columns: (str, list),
+                              data: pd.DataFrame):
         '''
+        Run self._check_column_abscence with error_or_warning="error".
+
+        :param columns: column name or list of column names to check
+        :param data: data frame whose columns are checked
+        :return: the result of self._check_column_abscence
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="error",
-                                           logger=logger)
+                                           error_or_warning="error")
 
-    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame, logger = None):
+    def warn_column_abscence(self, columns: (str, list), data: pd.DataFrame):
         '''
+        Run self._check_column_abscence with error_or_warning="warning".
+
+        :param columns: column name or list of column names to check
+        :param data: data frame whose columns are checked
+        :return: the result of self._check_column_abscence
         '''
         return self._check_column_abscence(columns=columns,
                                            data=data,
-                                           error_or_warning="warning",
-                                           logger=logger)
+                                           error_or_warning="warning")

+ 46 - 0
cdplib/utils/LoadingUtils.py

@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct  1 12:58:58 2020
+
+@author: tanya
+@description: class for methods of loading data from external sources
+"""
+
+import os
+import sys
+from cdplib.log import Log
+
+
+class LoadingUtils:
+    """
+    Methods for loading python objects from external sources.
+    """
+    def __init__(self, logger=None):
+        """
+        :param logger: logger used to report loading errors; when it is
+            omitted (or falsy) a Log("LoadingUtils") instance is created
+        """
+        self._logger = logger or Log("LoadingUtils")
+
+    def load_from_module(self, module_path: str, name: str):
+        """
+        Load the object called name from the python file module_path.
+
+        The file is executed as a module straight from its location, so
+        neither sys.path nor sys.modules is modified.
+
+        :param module_path: path to an existing .py file
+        :param name: name of the attribute to read from the loaded module
+        :return: the object bound to name in the loaded module
+        :raises AssertionError: when an argument is not a str, when
+            module_path is not an existing file or not a .py file
+
+        When the module cannot be imported or has no attribute name,
+        the error is passed to self._logger.log_and_raise_error
+        (presumably raising there - confirm against cdplib.log.Log).
+        """
+        # imported locally so that the method does not depend on
+        # changes to the import block of the file
+        import importlib.util
+
+        # validate the argument values, not the parameter names
+        for param, value in [("module_path", module_path), ("name", name)]:
+            assert isinstance(value, str),\
+                "Parameter '{}' must be of str type".format(param)
+
+        assert os.path.isfile(module_path),\
+            "Parameter 'module_path' must be a valid file"
+
+        module, extension = os.path.splitext(os.path.basename(module_path))
+
+        assert extension == ".py",\
+            "Parameter 'module_path' must point to a python file"
+
+        try:
+            spec = importlib.util.spec_from_file_location(
+                module, module_path)
+            loaded = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(loaded)
+
+        except ImportError:
+            loaded = None
+
+        if loaded is None or not hasattr(loaded, name):
+            err = "Invalid space location or name"
+            self._logger.log_and_raise_error(err)
+
+        else:
+            return getattr(loaded, name)

+ 36 - 0
cdplib/utils/TypeConverter.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 24 09:06:13 2020
+
+@author: tanya
+"""
+
+import numpy as np
+import pandas as pd
+
+class TypeConverter:
+    """
+    Library for methods to manage python types
+    """
+    def __init__(self, logger=None):
+        """
+        :param logger: logger used to report conversion errors; when it
+            is None a Log("TypeConverter") instance is created
+        """
+        if logger is None:
+            # imported lazily: only needed when no logger is supplied
+            from cdplib.log import Log
+
+            logger = Log("TypeConverter")
+
+        self._logger = logger
+
+    def convert_to_ndarray(
+            self,
+            x: (pd.DataFrame, pd.Series, np.ndarray)) -> np.ndarray:
+        '''
+        Converts a DataFrame or a Series to a numpy array.
+
+        :param x: numpy array (returned unchanged), pandas DataFrame or
+            pandas Series (their .values attribute is returned)
+        :return: the data of x as a numpy array
+
+        Any other type is reported through
+        self._logger.log_and_raise_error_stack_info.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        if isinstance(x, (pd.DataFrame, pd.Series)):
+            return x.values
+
+        self._logger.log_and_raise_error_stack_info(
+                'The argument must be a numpy array or a pandas DataFrame')