tanja 3 år sedan
förälder
incheckning
04ec30f765

+ 3 - 0
Pipfile

@@ -16,6 +16,9 @@ jsonref = "*"
 simplejson = "*"
 mysql = "*"
 hyperopt = "*"
+mypy = "*"
+data-science-types = "*"
+pytype = "*"
 
 [requires]
 python_version = "3"

+ 435 - 231
Pipfile.lock

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "5ae0ad9df8502aead1689e37517dd3bb8d75ac1c9554b865563d395fb9c1f60a"
+            "sha256": "aaf6cb558761e9ff6ccf0035a08008b15fb12bceb916e49f27a47c406b4e0d2f"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -16,24 +16,41 @@
         ]
     },
     "default": {
+        "attrs": {
+            "hashes": [
+                "sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6",
+                "sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700"
+            ],
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
+            "version": "==20.3.0"
+        },
         "boltons": {
             "hashes": [
-                "sha256:6e890b173c5f2dcb4ec62320b3799342ecb1a6a0b2253014455387665d62c213",
-                "sha256:b3fc2b711f50cd975e726324d98e0bd5a324dd7e3b81d5e6a1b03c542d0c66c4"
+                "sha256:3dd8a8e3c1886e7f7ba3422b50f55a66e1700161bf01b919d098e7d96dd2d9b6",
+                "sha256:dd362291a460cc1e0c2e91cc6a60da3036ced77099b623112e8f833e6734bdc5"
             ],
-            "version": "==20.1.0"
+            "version": "==20.2.1"
         },
         "cdplib": {
             "editable": true,
             "git": "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git",
-            "ref": "36c286e8f5ff2d441504e2286b2c3408d9756c75"
+            "ref": "2eacfa61358654a7e3e9150ae13aed8de9de1dc3"
         },
         "cloudpickle": {
             "hashes": [
-                "sha256:38af54d0e7705d87a287bdefe1df00f936aadb1f629dca383e825cca927fa753",
-                "sha256:8664761f810efc07dbb301459e413c99b68fcc6d8703912bd39d86618ac631e3"
+                "sha256:3a32d0eb0bc6f4d0c57fbc4f3e3780f7a81e6fee0fa935072884d58ae8e1cc7c",
+                "sha256:9bc994f9e9447593bd0a45371f0e7ac7333710fcf64a4eb9834bf149f4ef2f32"
+            ],
+            "markers": "python_version >= '3.5'",
+            "version": "==1.6.0"
+        },
+        "data-science-types": {
+            "hashes": [
+                "sha256:20ddbaaac3f3299e2091a64e74f78e64f4899f4ab5644bfd97e4694bd7b62ef4",
+                "sha256:86218af525896f84f3a39eef254449d795644311a64df78fba5eaf76aa610d6d"
             ],
-            "version": "==1.3.0"
+            "index": "pypi",
+            "version": "==0.2.19"
         },
         "decorator": {
             "hashes": [
@@ -51,19 +68,26 @@
         },
         "hyperopt": {
             "hashes": [
-                "sha256:52f4534e101f139b074ae626e0b7dc8410854b9410475d3e7f10c429393bb1a2",
-                "sha256:8caf0094fe824502932d949ee57bd3c92fe512dbbd93b7b7a78cd0761fa1a78f",
-                "sha256:df450eadfc9541086921bf863a5842e7009faef472b08630fd2cab13cdcfe0e6"
+                "sha256:bc6047d50f956ae64eebcb34b1fd40f186a93e214957f20e87af2f10195295cc",
+                "sha256:dc5c7cceaf33c125b727cf92709e70035d94dd507831dae66406ac762a18a253"
             ],
             "index": "pypi",
-            "version": "==0.2.3"
+            "version": "==0.2.5"
+        },
+        "importlab": {
+            "hashes": [
+                "sha256:d855350d19dc10a17aabd2fe6f4b428ff1a936071f692fbf686a73694d26a51c"
+            ],
+            "markers": "python_full_version >= '2.7.0'",
+            "version": "==0.5.1"
         },
         "joblib": {
             "hashes": [
-                "sha256:0630eea4f5664c463f23fbf5dcfc54a2bc6168902719fa8e19daf033022786c8",
-                "sha256:bdb4fd9b72915ffb49fde2229ce482dd7ae79d842ed8c2b4c932441495af1403"
+                "sha256:698c311779f347cf6b7e6b8a39bb682277b8ee4aba8cf9507bc0cf4cd4737b72",
+                "sha256:9e284edd6be6b71883a63c9b7f124738a3c16195513ad940eae7e3438de885d5"
             ],
-            "version": "==0.14.1"
+            "markers": "python_version >= '3.6'",
+            "version": "==0.17.0"
         },
         "jsonref": {
             "hashes": [
@@ -73,6 +97,33 @@
             "index": "pypi",
             "version": "==0.2"
         },
+        "mypy": {
+            "hashes": [
+                "sha256:0a0d102247c16ce93c97066443d11e2d36e6cc2a32d8ccc1f705268970479324",
+                "sha256:0d34d6b122597d48a36d6c59e35341f410d4abfa771d96d04ae2c468dd201abc",
+                "sha256:2170492030f6faa537647d29945786d297e4862765f0b4ac5930ff62e300d802",
+                "sha256:2842d4fbd1b12ab422346376aad03ff5d0805b706102e475e962370f874a5122",
+                "sha256:2b21ba45ad9ef2e2eb88ce4aeadd0112d0f5026418324176fd494a6824b74975",
+                "sha256:72060bf64f290fb629bd4a67c707a66fd88ca26e413a91384b18db3876e57ed7",
+                "sha256:af4e9ff1834e565f1baa74ccf7ae2564ae38c8df2a85b057af1dbbc958eb6666",
+                "sha256:bd03b3cf666bff8d710d633d1c56ab7facbdc204d567715cb3b9f85c6e94f669",
+                "sha256:c614194e01c85bb2e551c421397e49afb2872c88b5830e3554f0519f9fb1c178",
+                "sha256:cf4e7bf7f1214826cf7333627cb2547c0db7e3078723227820d0a2490f117a01",
+                "sha256:da56dedcd7cd502ccd3c5dddc656cb36113dd793ad466e894574125945653cea",
+                "sha256:e86bdace26c5fe9cf8cb735e7cedfe7850ad92b327ac5d797c656717d2ca66de",
+                "sha256:e97e9c13d67fbe524be17e4d8025d51a7dca38f90de2e462243ab8ed8a9178d1",
+                "sha256:eea260feb1830a627fb526d22fbb426b750d9f5a47b624e8d5e7e004359b219c"
+            ],
+            "index": "pypi",
+            "version": "==0.790"
+        },
+        "mypy-extensions": {
+            "hashes": [
+                "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
+                "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
+            ],
+            "version": "==0.4.3"
+        },
         "mysql": {
             "hashes": [
                 "sha256:55e66b5e7b3823b1da5fb2a063e95a628fb850b2a0b76bdcd884faac5d2daa7d"
@@ -82,137 +133,175 @@
         },
         "mysqlclient": {
             "hashes": [
-                "sha256:4c82187dd6ab3607150fbb1fa5ef4643118f3da122b8ba31c3149ddd9cf0cb39",
-                "sha256:9e6080a7aee4cc6a06b58b59239f20f1d259c1d2fddf68ddeed242d2311c7087",
-                "sha256:f3fdaa9a38752a3b214a6fe79d7cae3653731a53e577821f9187e67cbecb2e16",
-                "sha256:f646f8d17d02be0872291f258cce3813497bc7888cd4712a577fd1e719b2f213"
+                "sha256:3f39855a4ad22805361e782cc4d1010ac74796225fa2d1c03cc16673ccdc983a",
+                "sha256:a6b5648f648b16335e3b1aaec93dc3fcc81a9a661180e306936437cc522c810b",
+                "sha256:edd42ccaa444b00702d5374b2f5f7585c9d0ce201917f15339f1c3cf91c1b1ed",
+                "sha256:fb2f75aea14722390d2d8ddf384ad99da708c707a96656210a7be8af20a2c5e5"
             ],
-            "version": "==1.4.6"
+            "markers": "python_version >= '3.5'",
+            "version": "==2.0.1"
         },
         "networkx": {
             "hashes": [
-                "sha256:45e56f7ab6fe81652fb4bc9f44faddb0e9025f469f602df14e3b2551c2ea5c8b"
+                "sha256:7978955423fbc9639c10498878be59caf99b44dc304c2286162fd24b458c1602",
+                "sha256:8c5812e9f798d37c50570d15c4a69d5710a18d77bafc903ee9c5fba7454c616c"
+            ],
+            "markers": "python_version >= '3.6'",
+            "version": "==2.5"
+        },
+        "ninja": {
+            "hashes": [
+                "sha256:06a72090f5c5516e57f12699644179504a77585bed6d5f8be9e67219a398ec80",
+                "sha256:16fc1bea52a36a91a0e80c3b221d2c1bc9bcf04d0564da9344e349b8c5efd5c6",
+                "sha256:1d9ed3b5fdeb646516f54bec92453dcb3000d6771c2fea56451444c988a23e29",
+                "sha256:24acc95359308d11243386cf9f076bdc95f438ef6a4e0e357e7c122c5e02816d",
+                "sha256:4252ce532304841e47478bb61710fcf9940cf2c91731303490762b6e4f23fd2b",
+                "sha256:5c3a8cb54aaaf5d4f692d65121ef47b3e43dea123a6563153d9d97631c0adf4f",
+                "sha256:621fd73513a9bef0cb82e8c531a29ef96580b4d6e797f833cce167054ad812f8",
+                "sha256:99c6102ae9a8981afe4d06f92508dbeab1e28ec89783fb703411166f4e13c9ee",
+                "sha256:a1a9d9455623a3f45557fff6eb5abb3e70910dde28cfb9239e3ca14249149f55",
+                "sha256:c6059bd04ad235e2326b39bc71bb7989de8d565084b5f269557704747b2910fa",
+                "sha256:fb1ae96811a9b73773014b8a21d710b89d7d5f765427a5e2541e7fb9d530fdd5"
             ],
-            "version": "==2.2"
+            "version": "==1.10.0.post2"
         },
         "numpy": {
             "hashes": [
-                "sha256:1598a6de323508cfeed6b7cd6c4efb43324f4692e20d1f76e1feec7f59013448",
-                "sha256:1b0ece94018ae21163d1f651b527156e1f03943b986188dd81bc7e066eae9d1c",
-                "sha256:2e40be731ad618cb4974d5ba60d373cdf4f1b8dcbf1dcf4d9dff5e212baf69c5",
-                "sha256:4ba59db1fcc27ea31368af524dcf874d9277f21fd2e1f7f1e2e0c75ee61419ed",
-                "sha256:59ca9c6592da581a03d42cc4e270732552243dc45e87248aa8d636d53812f6a5",
-                "sha256:5e0feb76849ca3e83dd396254e47c7dba65b3fa9ed3df67c2556293ae3e16de3",
-                "sha256:6d205249a0293e62bbb3898c4c2e1ff8a22f98375a34775a259a0523111a8f6c",
-                "sha256:6fcc5a3990e269f86d388f165a089259893851437b904f422d301cdce4ff25c8",
-                "sha256:82847f2765835c8e5308f136bc34018d09b49037ec23ecc42b246424c767056b",
-                "sha256:87902e5c03355335fc5992a74ba0247a70d937f326d852fc613b7f53516c0963",
-                "sha256:9ab21d1cb156a620d3999dd92f7d1c86824c622873841d6b080ca5495fa10fef",
-                "sha256:a1baa1dc8ecd88fb2d2a651671a84b9938461e8a8eed13e2f0a812a94084d1fa",
-                "sha256:a244f7af80dacf21054386539699ce29bcc64796ed9850c99a34b41305630286",
-                "sha256:a35af656a7ba1d3decdd4fae5322b87277de8ac98b7d9da657d9e212ece76a61",
-                "sha256:b1fe1a6f3a6f355f6c29789b5927f8bd4f134a4bd9a781099a7c4f66af8850f5",
-                "sha256:b5ad0adb51b2dee7d0ee75a69e9871e2ddfb061c73ea8bc439376298141f77f5",
-                "sha256:ba3c7a2814ec8a176bb71f91478293d633c08582119e713a0c5351c0f77698da",
-                "sha256:cd77d58fb2acf57c1d1ee2835567cd70e6f1835e32090538f17f8a3a99e5e34b",
-                "sha256:cdb3a70285e8220875e4d2bc394e49b4988bdb1298ffa4e0bd81b2f613be397c",
-                "sha256:deb529c40c3f1e38d53d5ae6cd077c21f1d49e13afc7936f7f868455e16b64a0",
-                "sha256:e7894793e6e8540dbeac77c87b489e331947813511108ae097f1715c018b8f3d"
+                "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db",
+                "sha256:09c12096d843b90eafd01ea1b3307e78ddd47a55855ad402b157b6c4862197ce",
+                "sha256:13d166f77d6dc02c0a73c1101dd87fdf01339febec1030bd810dcd53fff3b0f1",
+                "sha256:141ec3a3300ab89c7f2b0775289954d193cc8edb621ea05f99db9cb181530512",
+                "sha256:16c1b388cc31a9baa06d91a19366fb99ddbe1c7b205293ed072211ee5bac1ed2",
+                "sha256:18bed2bcb39e3f758296584337966e68d2d5ba6aab7e038688ad53c8f889f757",
+                "sha256:1aeef46a13e51931c0b1cf8ae1168b4a55ecd282e6688fdb0a948cc5a1d5afb9",
+                "sha256:27d3f3b9e3406579a8af3a9f262f5339005dd25e0ecf3cf1559ff8a49ed5cbf2",
+                "sha256:2a2740aa9733d2e5b2dfb33639d98a64c3b0f24765fed86b0fd2aec07f6a0a08",
+                "sha256:4377e10b874e653fe96985c05feed2225c912e328c8a26541f7fc600fb9c637b",
+                "sha256:448ebb1b3bf64c0267d6b09a7cba26b5ae61b6d2dbabff7c91b660c7eccf2bdb",
+                "sha256:50e86c076611212ca62e5a59f518edafe0c0730f7d9195fec718da1a5c2bb1fc",
+                "sha256:5734bdc0342aba9dfc6f04920988140fb41234db42381cf7ccba64169f9fe7ac",
+                "sha256:64324f64f90a9e4ef732be0928be853eee378fd6a01be21a0a8469c4f2682c83",
+                "sha256:6ae6c680f3ebf1cf7ad1d7748868b39d9f900836df774c453c11c5440bc15b36",
+                "sha256:6d7593a705d662be5bfe24111af14763016765f43cb6923ed86223f965f52387",
+                "sha256:8cac8790a6b1ddf88640a9267ee67b1aee7a57dfa2d2dd33999d080bc8ee3a0f",
+                "sha256:8ece138c3a16db8c1ad38f52eb32be6086cc72f403150a79336eb2045723a1ad",
+                "sha256:9eeb7d1d04b117ac0d38719915ae169aa6b61fca227b0b7d198d43728f0c879c",
+                "sha256:a09f98011236a419ee3f49cedc9ef27d7a1651df07810ae430a6b06576e0b414",
+                "sha256:a5d897c14513590a85774180be713f692df6fa8ecf6483e561a6d47309566f37",
+                "sha256:ad6f2ff5b1989a4899bf89800a671d71b1612e5ff40866d1f4d8bcf48d4e5764",
+                "sha256:c42c4b73121caf0ed6cd795512c9c09c52a7287b04d105d112068c1736d7c753",
+                "sha256:cb1017eec5257e9ac6209ac172058c430e834d5d2bc21961dceeb79d111e5909",
+                "sha256:d6c7bb82883680e168b55b49c70af29b84b84abb161cbac2800e8fcb6f2109b6",
+                "sha256:e452dc66e08a4ce642a961f134814258a082832c78c90351b75c41ad16f79f63",
+                "sha256:e5b6ed0f0b42317050c88022349d994fe72bfe35f5908617512cd8c8ef9da2a9",
+                "sha256:e9b30d4bd69498fc0c3fe9db5f62fffbb06b8eb9321f92cc970f2969be5e3949",
+                "sha256:ec149b90019852266fec2341ce1db513b843e496d5a8e8cdb5ced1923a92faab",
+                "sha256:edb01671b3caae1ca00881686003d16c2209e07b7ef8b7639f1867852b948f7c",
+                "sha256:f0d3929fe88ee1c155129ecd82f981b8856c5d97bcb0d5f23e9b4242e79d1de3",
+                "sha256:f29454410db6ef8126c83bd3c968d143304633d45dc57b51252afbd79d700893",
+                "sha256:fe45becb4c2f72a0907c1d0246ea6449fe7a9e2293bb0e11c4e9a32bb0930a15",
+                "sha256:fedbd128668ead37f33917820b704784aff695e0019309ad446a6d0b065b57e4"
             ],
-            "markers": "python_version >= '3.5'",
-            "version": "==1.18.2"
+            "markers": "python_version >= '3.6'",
+            "version": "==1.19.4"
         },
         "pandas": {
             "hashes": [
-                "sha256:07c1b58936b80eafdfe694ce964ac21567b80a48d972879a359b3ebb2ea76835",
-                "sha256:0ebe327fb088df4d06145227a4aa0998e4f80a9e6aed4b61c1f303bdfdf7c722",
-                "sha256:11c7cb654cd3a0e9c54d81761b5920cdc86b373510d829461d8f2ed6d5905266",
-                "sha256:12f492dd840e9db1688126216706aa2d1fcd3f4df68a195f9479272d50054645",
-                "sha256:167a1315367cea6ec6a5e11e791d9604f8e03f95b57ad227409de35cf850c9c5",
-                "sha256:1a7c56f1df8d5ad8571fa251b864231f26b47b59cbe41aa5c0983d17dbb7a8e4",
-                "sha256:1fa4bae1a6784aa550a1c9e168422798104a85bf9c77a1063ea77ee6f8452e3a",
-                "sha256:32f42e322fb903d0e189a4c10b75ba70d90958cc4f66a1781ed027f1a1d14586",
-                "sha256:387dc7b3c0424327fe3218f81e05fc27832772a5dffbed385013161be58df90b",
-                "sha256:6597df07ea361231e60c00692d8a8099b519ed741c04e65821e632bc9ccb924c",
-                "sha256:743bba36e99d4440403beb45a6f4f3a667c090c00394c176092b0b910666189b",
-                "sha256:858a0d890d957ae62338624e4aeaf1de436dba2c2c0772570a686eaca8b4fc85",
-                "sha256:863c3e4b7ae550749a0bb77fa22e601a36df9d2905afef34a6965bed092ba9e5",
-                "sha256:a210c91a02ec5ff05617a298ad6f137b9f6f5771bf31f2d6b6367d7f71486639",
-                "sha256:ca84a44cf727f211752e91eab2d1c6c1ab0f0540d5636a8382a3af428542826e",
-                "sha256:d234bcf669e8b4d6cbcd99e3ce7a8918414520aeb113e2a81aeb02d0a533d7f7"
+                "sha256:09e0503758ad61afe81c9069505f8cb8c1e36ea8cc1e6826a95823ef5b327daf",
+                "sha256:0a11a6290ef3667575cbd4785a1b62d658c25a2fd70a5adedba32e156a8f1773",
+                "sha256:0d9a38a59242a2f6298fff45d09768b78b6eb0c52af5919ea9e45965d7ba56d9",
+                "sha256:112c5ba0f9ea0f60b2cc38c25f87ca1d5ca10f71efbee8e0f1bee9cf584ed5d5",
+                "sha256:185cf8c8f38b169dbf7001e1a88c511f653fbb9dfa3e048f5e19c38049e991dc",
+                "sha256:3aa8e10768c730cc1b610aca688f588831fa70b65a26cb549fbb9f35049a05e0",
+                "sha256:41746d520f2b50409dffdba29a15c42caa7babae15616bcf80800d8cfcae3d3e",
+                "sha256:43cea38cbcadb900829858884f49745eb1f42f92609d368cabcc674b03e90efc",
+                "sha256:5378f58172bd63d8c16dd5d008d7dcdd55bf803fcdbe7da2dcb65dbbf322f05b",
+                "sha256:54404abb1cd3f89d01f1fb5350607815326790efb4789be60508f458cdd5ccbf",
+                "sha256:5dac3aeaac5feb1016e94bde851eb2012d1733a222b8afa788202b836c97dad5",
+                "sha256:5fdb2a61e477ce58d3f1fdf2470ee142d9f0dde4969032edaf0b8f1a9dafeaa2",
+                "sha256:6613c7815ee0b20222178ad32ec144061cb07e6a746970c9160af1ebe3ad43b4",
+                "sha256:6d2b5b58e7df46b2c010ec78d7fb9ab20abf1d306d0614d3432e7478993fbdb0",
+                "sha256:8a5d7e57b9df2c0a9a202840b2881bb1f7a648eba12dd2d919ac07a33a36a97f",
+                "sha256:8b4c2055ebd6e497e5ecc06efa5b8aa76f59d15233356eb10dad22a03b757805",
+                "sha256:a15653480e5b92ee376f8458197a58cca89a6e95d12cccb4c2d933df5cecc63f",
+                "sha256:a7d2547b601ecc9a53fd41561de49a43d2231728ad65c7713d6b616cd02ddbed",
+                "sha256:a979d0404b135c63954dea79e6246c45dd45371a88631cdbb4877d844e6de3b6",
+                "sha256:b1f8111635700de7ac350b639e7e452b06fc541a328cf6193cf8fc638804bab8",
+                "sha256:c5a3597880a7a29a31ebd39b73b2c824316ae63a05c3c8a5ce2aea3fc68afe35",
+                "sha256:c681e8fcc47a767bf868341d8f0d76923733cbdcabd6ec3a3560695c69f14a1e",
+                "sha256:cf135a08f306ebbcfea6da8bf775217613917be23e5074c69215b91e180caab4",
+                "sha256:e2b8557fe6d0a18db4d61c028c6af61bfed44ef90e419ed6fadbdc079eba141e"
             ],
             "index": "pypi",
-            "version": "==1.0.3"
+            "version": "==1.1.4"
         },
         "pymongo": {
             "hashes": [
-                "sha256:01b4e10027aef5bb9ecefbc26f5df3368ce34aef81df43850f701e716e3fe16d",
-                "sha256:0fc5aa1b1acf7f61af46fe0414e6a4d0c234b339db4c03a63da48599acf1cbfc",
-                "sha256:1396eb7151e0558b1f817e4b9d7697d5599e5c40d839a9f7270bd90af994ad82",
-                "sha256:18e84a3ec5e73adcb4187b8e5541b2ad61d716026ed9863267e650300d8bea33",
-                "sha256:19adf2848b80cb349b9891cc854581bbf24c338be9a3260e73159bdeb2264464",
-                "sha256:20ee0475aa2ba437b0a14806f125d696f90a8433d820fb558fdd6f052acde103",
-                "sha256:26798795097bdeb571f13942beef7e0b60125397811c75b7aa9214d89880dd1d",
-                "sha256:26e707a4eb851ec27bb969b5f1413b9b2eac28fe34271fa72329100317ea7c73",
-                "sha256:2a3c7ad01553b27ec553688a1e6445e7f40355fb37d925c11fcb50b504e367f8",
-                "sha256:2f07b27dbf303ea53f4147a7922ce91a26b34a0011131471d8aaf73151fdee9a",
-                "sha256:316f0cf543013d0c085e15a2c8abe0db70f93c9722c0f99b6f3318ff69477d70",
-                "sha256:31d11a600eea0c60de22c8bdcb58cda63c762891facdcb74248c36713240987f",
-                "sha256:334ef3ffd0df87ea83a0054454336159f8ad9c1b389e19c0032d9cb8410660e6",
-                "sha256:358ba4693c01022d507b96a980ded855a32dbdccc3c9331d0667be5e967f30ed",
-                "sha256:3a6568bc53103df260f5c7d2da36dffc5202b9a36c85540bba1836a774943794",
-                "sha256:444bf2f44264578c4085bb04493bfed0e5c1b4fe7c2704504d769f955cc78fe4",
-                "sha256:47a00b22c52ee59dffc2aad02d0bbfb20c26ec5b8de8900492bf13ad6901cf35",
-                "sha256:4c067db43b331fc709080d441cb2e157114fec60749667d12186cc3fc8e7a951",
-                "sha256:4c092310f804a5d45a1bcaa4191d6d016c457b6ed3982a622c35f729ff1c7f6b",
-                "sha256:53b711b33134e292ef8499835a3df10909c58df53a2a0308f598c432e9a62892",
-                "sha256:568d6bee70652d8a5af1cd3eec48b4ca1696fb1773b80719ebbd2925b72cb8f6",
-                "sha256:56fa55032782b7f8e0bf6956420d11e2d4e9860598dfe9c504edec53af0fc372",
-                "sha256:5a2c492680c61b440272341294172fa3b3751797b1ab983533a770e4fb0a67ac",
-                "sha256:61235cc39b5b2f593086d1d38f3fc130b2d125bd8fc8621d35bc5b6bdeb92bd2",
-                "sha256:619ac9aaf681434b4d4718d1b31aa2f0fce64f2b3f8435688fcbdc0c818b6c54",
-                "sha256:6238ac1f483494011abde5286282afdfacd8926659e222ba9b74c67008d3a58c",
-                "sha256:63752a72ca4d4e1386278bd43d14232f51718b409e7ac86bcf8810826b531113",
-                "sha256:6fdc5ccb43864065d40dd838437952e9e3da9821b7eac605ba46ada77f846bdf",
-                "sha256:7abc3a6825a346fa4621a6f63e3b662bbb9e0f6ffc32d30a459d695f20fb1a8b",
-                "sha256:7aef381bb9ae8a3821abd7f9d4d93978dbd99072b48522e181baeffcd95b56ae",
-                "sha256:80df3caf251fe61a3f0c9614adc6e2bfcffd1cd3345280896766712fb4b4d6d7",
-                "sha256:95f970f34b59987dee6f360d2e7d30e181d58957b85dff929eee4423739bd151",
-                "sha256:993257f6ca3cde55332af1f62af3e04ca89ce63c08b56a387cdd46136c72f2fa",
-                "sha256:9c0a57390549affc2b5dda24a38de03a5c7cbc58750cd161ff5d106c3c6eec80",
-                "sha256:a0794e987d55d2f719cc95fcf980fc62d12b80e287e6a761c4be14c60bd9fecc",
-                "sha256:a3b98121e68bf370dd8ea09df67e916f93ea95b52fc010902312168c4d1aff5d",
-                "sha256:a60756d55f0887023b3899e6c2923ba5f0042fb11b1d17810b4e07395404f33e",
-                "sha256:a676bd2fbc2309092b9bbb0083d35718b5420af3a42135ebb1e4c3633f56604d",
-                "sha256:a732838c78554c1257ff2492f5c8c4c7312d0aecd7f732149e255f3749edd5ee",
-                "sha256:ad3dc88dfe61f0f1f9b99c6bc833ea2f45203a937a18f0d2faa57c6952656012",
-                "sha256:ae65d65fde4135ef423a2608587c9ef585a3551fc2e4e431e7c7e527047581be",
-                "sha256:b070a4f064a9edb70f921bfdc270725cff7a78c22036dd37a767c51393fb956f",
-                "sha256:b6da85949aa91e9f8c521681344bd2e163de894a5492337fba8b05c409225a4f",
-                "sha256:bbf47110765b2a999803a7de457567389253f8670f7daafb98e059c899ce9764",
-                "sha256:bd9c1e6f92b4888ae3ef7ae23262c513b962f09f3fb3b48581dde5df7d7a860a",
-                "sha256:c06b3f998d2d7160db58db69adfb807d2ec307e883e2f17f6b87a1ef6c723f11",
-                "sha256:c318fb70542be16d3d4063cde6010b1e4d328993a793529c15a619251f517c39",
-                "sha256:c4aef42e5fa4c9d5a99f751fb79caa880dac7eaf8a65121549318b984676a1b7",
-                "sha256:c9ca545e93a9c2a3bdaa2e6e21f7a43267ff0813e8055adf2b591c13164c0c57",
-                "sha256:da2c3220eb55c4239dd8b982e213da0b79023cac59fe54ca09365f2bc7e4ad32",
-                "sha256:dd8055da300535eefd446b30995c0813cc4394873c9509323762a93e97c04c03",
-                "sha256:e2b46e092ea54b732d98c476720386ff2ccd126de1e52076b470b117bff7e409",
-                "sha256:e334c4f39a2863a239d38b5829e442a87f241a92da9941861ee6ec5d6380b7fe",
-                "sha256:e5c54f04ca42bbb5153aec5d4f2e3d9f81e316945220ac318abd4083308143f5",
-                "sha256:f4d06764a06b137e48db6d569dc95614d9d225c89842c885669ee8abc9f28c7a",
-                "sha256:f96333f9d2517c752c20a35ff95de5fc2763ac8cdb1653df0f6f45d281620606"
+                "sha256:03dc64a9aa7a5d405aea5c56db95835f6a2fa31b3502c5af1760e0e99210be30",
+                "sha256:05fcc6f9c60e6efe5219fbb5a30258adb3d3e5cbd317068f3d73c09727f2abb6",
+                "sha256:076a7f2f7c251635cf6116ac8e45eefac77758ee5a77ab7bd2f63999e957613b",
+                "sha256:137e6fa718c7eff270dbd2fc4b90d94b1a69c9e9eb3f3de9e850a7fd33c822dc",
+                "sha256:1f865b1d1c191d785106f54df9abdc7d2f45a946b45fd1ea0a641b4f982a2a77",
+                "sha256:213c445fe7e654621c6309e874627c35354b46ef3ee807f5a1927dc4b30e1a67",
+                "sha256:25e617daf47d8dfd4e152c880cd0741cbdb48e51f54b8de9ddbfe74ecd87dd16",
+                "sha256:3d9bb1ba935a90ec4809a8031efd988bdb13cdba05d9e9a3e9bf151bf759ecde",
+                "sha256:40696a9a53faa7d85aaa6fd7bef1cae08f7882640bad08c350fb59dee7ad069b",
+                "sha256:421aa1b92c291c429668bd8d8d8ec2bd00f183483a756928e3afbf2b6f941f00",
+                "sha256:4437300eb3a5e9cc1a73b07d22c77302f872f339caca97e9bf8cf45eca8fa0d2",
+                "sha256:455f4deb00158d5ec8b1d3092df6abb681b225774ab8a59b3510293b4c8530e3",
+                "sha256:475a34a0745c456ceffaec4ce86b7e0983478f1b6140890dff7b161e7bcd895b",
+                "sha256:4797c0080f41eba90404335e5ded3aa66731d303293a675ff097ce4ea3025bb9",
+                "sha256:4ae23fbbe9eadf61279a26eba866bbf161a6f7e2ffad14a42cf20e9cb8e94166",
+                "sha256:4b32744901ee9990aa8cd488ec85634f443526def1e5190a407dc107148249d7",
+                "sha256:50127b13b38e8e586d5e97d342689405edbd74ad0bd891d97ee126a8c7b6e45f",
+                "sha256:50531caa7b4be1c4ed5e2d5793a4e51cc9bd62a919a6fd3299ef7c902e206eab",
+                "sha256:63a5387e496a98170ffe638b435c0832c0f2011a6f4ff7a2880f17669fff8c03",
+                "sha256:68220b81850de8e966d4667d5c325a96c6ac0d6adb3d18935d6e3d325d441f48",
+                "sha256:689142dc0c150e9cb7c012d84cac2c346d40beb891323afb6caf18ec4caafae0",
+                "sha256:6a15e2bee5c4188369a87ed6f02de804651152634a46cca91966a11c8abd2550",
+                "sha256:7122ffe597b531fb065d3314e704a6fe152b81820ca5f38543e70ffcc95ecfd4",
+                "sha256:7307024b18266b302f4265da84bb1effb5d18999ef35b30d17592959568d5c0a",
+                "sha256:7a4a6f5b818988a3917ec4baa91d1143242bdfece8d38305020463955961266a",
+                "sha256:83c5a3ecd96a9f3f11cfe6dfcbcec7323265340eb24cc996acaecea129865a3a",
+                "sha256:890b0f1e18dbd898aeb0ab9eae1ab159c6bcbe87f0abb065b0044581d8614062",
+                "sha256:8deda1f7b4c03242f2a8037706d9584e703f3d8c74d6d9cac5833db36fe16c42",
+                "sha256:8ea13d0348b4c96b437d944d7068d59ed4a6c98aaa6c40d8537a2981313f1c66",
+                "sha256:91e96bf85b7c07c827d339a386e8a3cf2e90ef098c42595227f729922d0851df",
+                "sha256:96782ebb3c9e91e174c333208b272ea144ed2a684413afb1038e3b3342230d72",
+                "sha256:9755c726aa6788f076114dfdc03b92b03ff8860316cca00902cce88bcdb5fedd",
+                "sha256:9dbab90c348c512e03f146e93a5e2610acec76df391043ecd46b6b775d5397e6",
+                "sha256:9ee0eef254e340cc11c379f797af3977992a7f2c176f1a658740c94bf677e13c",
+                "sha256:9fc17fdac8f1973850d42e51e8ba6149d93b1993ed6768a24f352f926dd3d587",
+                "sha256:a2787319dc69854acdfd6452e6a8ba8f929aeb20843c7f090e04159fc18e6245",
+                "sha256:b7c522292407fa04d8195032493aac937e253ad9ae524aab43b9d9d242571f03",
+                "sha256:bd312794f51e37dcf77f013d40650fe4fbb211dd55ef2863839c37480bd44369",
+                "sha256:c0d660a186e36c526366edf8a64391874fe53cf8b7039224137aee0163c046df",
+                "sha256:c4869141e20769b65d2d72686e7a7eb141ce9f3168106bed3e7dcced54eb2422",
+                "sha256:cc4057f692ac35bbe82a0a908d42ce3a281c9e913290fac37d7fa3bd01307dfb",
+                "sha256:cccf1e7806f12300e3a3b48f219e111000c2538483e85c869c35c1ae591e6ce9",
+                "sha256:ce208f80f398522e49d9db789065c8ad2cd37b21bd6b23d30053474b7416af11",
+                "sha256:d0565481dc196986c484a7fb13214fc6402201f7fb55c65fd215b3324962fe6c",
+                "sha256:d1b3366329c45a474b3bbc9b9c95d4c686e03f35da7fd12bc144626d1f2a7c04",
+                "sha256:d226e0d4b9192d95079a9a29c04dd81816b1ce8903b8c174a39224fe978547cb",
+                "sha256:d38b35f6eef4237b1d0d8e845fc1546dad85c55eba447e28c211da8c7ef9697c",
+                "sha256:d64c98277ea80e4484f1332ab107e8dfd173a7dcf1bdbf10a9cccc97aaab145f",
+                "sha256:d9de8427a5601799784eb0e7fa1b031aa64086ce04de29df775a8ca37eedac41",
+                "sha256:e6a15cf8f887d9f578dd49c6fb3a99d53e1d922fdd67a245a67488d77bf56eb2",
+                "sha256:e8c446882cbb3774cd78c738c9f58220606b702b7c1655f1423357dc51674054",
+                "sha256:e8d188ee39bd0ffe76603da887706e4e7b471f613625899ddf1e27867dc6a0d3",
+                "sha256:ef76535776c0708a85258f6dc51d36a2df12633c735f6d197ed7dfcaa7449b99",
+                "sha256:f6efca006a81e1197b925a7d7b16b8f61980697bb6746587aad8842865233218"
             ],
             "index": "pypi",
-            "version": "==3.10.1"
+            "version": "==3.11.0"
         },
         "pymysql": {
             "hashes": [
-                "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a",
-                "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"
+                "sha256:263040d2779a3b84930f7ac9da5132be0fefcd6f453a885756656103f8ee1fdd",
+                "sha256:44f47128dda8676e021c8d2dbb49a82be9e4ab158b9f03e897152a3a287c69ea"
             ],
             "index": "pypi",
-            "version": "==0.9.3"
+            "version": "==0.10.1"
         },
         "python-dateutil": {
             "hashes": [
@@ -222,108 +311,153 @@
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.8.1"
         },
+        "pytype": {
+            "hashes": [
+                "sha256:01c2dc3664b550e5c571c432035eda85c5b1ba0bc2675f50bd24f226fda25fc2",
+                "sha256:1b63bfccdd68a8f8a80358fccf09c2a52b2e8d0e079e7ae9c034ba5df4356418",
+                "sha256:409ff5f52e767ec957014d1c5c1abf2e246446896d333c25f8f2a19de150f85e",
+                "sha256:6353e37f0df5037a1f18d0692b9b0b2d71ed0bb1e3b1d6d8d29458ef1a18cb81",
+                "sha256:926dea04b6fc9e396b69281679dbbe982f3825d8a3590ba63e671460d58ff192",
+                "sha256:e2ea11478665f7496f2e6f9b38956a01e47ab18462961ae5acfeb99c937dcef0",
+                "sha256:e97ff9dea170897e35fd1bf5934863176c7d97fbf533d2020ff0ab751dc2e389"
+            ],
+            "index": "pypi",
+            "version": "==2020.11.3"
+        },
         "pytz": {
             "hashes": [
-                "sha256:1c557d7d0e871de1f5ccd5833f60fb2550652da6be2693c1e02300743d21500d",
-                "sha256:b02c06db6cf09c12dd25137e563b31700d3b80fcc4ad23abb7a315f2789819be"
+                "sha256:3e6b7dd2d1e0a59084bcee14a17af60c5c562cdc16d828e8eba2e683d3a7e268",
+                "sha256:5c55e189b682d420be27c6995ba6edce0c0a77dd67bfbe2ae6607134d5851ffd"
             ],
-            "version": "==2019.3"
+            "version": "==2020.4"
+        },
+        "pyyaml": {
+            "hashes": [
+                "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97",
+                "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76",
+                "sha256:4f4b913ca1a7319b33cfb1369e91e50354d6f07a135f3b901aca02aa95940bd2",
+                "sha256:69f00dca373f240f842b2931fb2c7e14ddbacd1397d57157a9b005a6a9942648",
+                "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf",
+                "sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f",
+                "sha256:7739fc0fa8205b3ee8808aea45e968bc90082c10aef6ea95e855e10abf4a37b2",
+                "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee",
+                "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d",
+                "sha256:cc8955cfbfc7a115fa81d85284ee61147059a753344bc51098f3ccd69b0d7e0c",
+                "sha256:d13155f591e6fcc1ec3b30685d50bf0711574e2c0dfffd7644babf8b5102ca1a"
+            ],
+            "version": "==5.3.1"
         },
         "scikit-learn": {
             "hashes": [
-                "sha256:1bf45e62799b6938357cfce19f72e3751448c4b27010e4f98553da669b5bbd86",
-                "sha256:267ad874b54c67b479c3b45eb132ef4a56ab2b27963410624a413a4e2a3fc388",
-                "sha256:2d1bb83d6c51a81193d8a6b5f31930e2959c0e1019d49bdd03f54163735dae4b",
-                "sha256:349ba3d837fb3f7cb2b91486c43713e4b7de17f9e852f165049b1b7ac2f81478",
-                "sha256:3f4d8eea3531d3eaf613fa33f711113dfff6021d57a49c9d319af4afb46f72f0",
-                "sha256:4990f0e166292d2a0f0ee528233723bcfd238bfdb3ec2512a9e27f5695362f35",
-                "sha256:57538d138ba54407d21e27c306735cbd42a6aae0df6a5a30c7a6edde46b0017d",
-                "sha256:5b722e8bb708f254af028dc2da86d23df5371cba57e24f889b672e7b15423caa",
-                "sha256:6043e2c4ccfc68328c331b0fc19691be8fb02bd76d694704843a23ad651de902",
-                "sha256:672ea38eb59b739a8907ec063642b486bcb5a2073dda5b72b7983eeaf1fd67c1",
-                "sha256:73207dca6e70f8f611f28add185cf3a793c8232a1722f21d82259560dc35cd50",
-                "sha256:83fc104a799cb340054e485c25dfeee712b36f5638fb374eba45a9db490f16ff",
-                "sha256:8416150ab505f1813da02cdbdd9f367b05bfc75cf251235015bb09f8674358a0",
-                "sha256:84e759a766c315deb5c85139ff879edbb0aabcddb9358acf499564ed1c21e337",
-                "sha256:8ed66ab27b3d68e57bb1f315fc35e595a5c4a1f108c3420943de4d18fc40e615",
-                "sha256:a7f8aa93f61aaad080b29a9018db93ded0586692c03ddf2122e47dd1d3a14e1b",
-                "sha256:ddd3bf82977908ff69303115dd5697606e669d8a7eafd7d83bb153ef9e11bd5e",
-                "sha256:de9933297f8659ee3bb330eafdd80d74cd73d5dab39a9026b65a4156bc479063",
-                "sha256:ea91a70a992ada395efc3d510cf011dc2d99dc9037bb38cd1cb00e14745005f5",
-                "sha256:eb4c9f0019abb374a2e55150f070a333c8f990b850d1eb4dfc2765fc317ffc7c",
-                "sha256:ffce8abfdcd459e72e5b91727b247b401b22253cbd18d251f842a60e26262d6f"
+                "sha256:0a127cc70990d4c15b1019680bfedc7fec6c23d14d3719fdf9b64b22d37cdeca",
+                "sha256:0d39748e7c9669ba648acf40fb3ce96b8a07b240db6888563a7cb76e05e0d9cc",
+                "sha256:1b8a391de95f6285a2f9adffb7db0892718950954b7149a70c783dc848f104ea",
+                "sha256:20766f515e6cd6f954554387dfae705d93c7b544ec0e6c6a5d8e006f6f7ef480",
+                "sha256:2aa95c2f17d2f80534156215c87bee72b6aa314a7f8b8fe92a2d71f47280570d",
+                "sha256:5ce7a8021c9defc2b75620571b350acc4a7d9763c25b7593621ef50f3bd019a2",
+                "sha256:6c28a1d00aae7c3c9568f61aafeaad813f0f01c729bee4fd9479e2132b215c1d",
+                "sha256:7671bbeddd7f4f9a6968f3b5442dac5f22bf1ba06709ef888cc9132ad354a9ab",
+                "sha256:914ac2b45a058d3f1338d7736200f7f3b094857758895f8667be8a81ff443b5b",
+                "sha256:98508723f44c61896a4e15894b2016762a55555fbf09365a0bb1870ecbd442de",
+                "sha256:a64817b050efd50f9abcfd311870073e500ae11b299683a519fbb52d85e08d25",
+                "sha256:cb3e76380312e1f86abd20340ab1d5b3cc46a26f6593d3c33c9ea3e4c7134028",
+                "sha256:d0dcaa54263307075cb93d0bee3ceb02821093b1b3d25f66021987d305d01dce",
+                "sha256:d9a1ce5f099f29c7c33181cc4386660e0ba891b21a60dc036bf369e3a3ee3aec",
+                "sha256:da8e7c302003dd765d92a5616678e591f347460ac7b53e53d667be7dfe6d1b10",
+                "sha256:daf276c465c38ef736a79bd79fc80a249f746bcbcae50c40945428f7ece074f8"
             ],
-            "markers": "python_version >= '3.5'",
-            "version": "==0.22.2.post1"
+            "markers": "python_version >= '3.6'",
+            "version": "==0.23.2"
         },
         "scipy": {
             "hashes": [
-                "sha256:00af72998a46c25bdb5824d2b729e7dabec0c765f9deb0b504f928591f5ff9d4",
-                "sha256:0902a620a381f101e184a958459b36d3ee50f5effd186db76e131cbefcbb96f7",
-                "sha256:1e3190466d669d658233e8a583b854f6386dd62d655539b77b3fa25bfb2abb70",
-                "sha256:2cce3f9847a1a51019e8c5b47620da93950e58ebc611f13e0d11f4980ca5fecb",
-                "sha256:3092857f36b690a321a662fe5496cb816a7f4eecd875e1d36793d92d3f884073",
-                "sha256:386086e2972ed2db17cebf88610aab7d7f6e2c0ca30042dc9a89cf18dcc363fa",
-                "sha256:71eb180f22c49066f25d6df16f8709f215723317cc951d99e54dc88020ea57be",
-                "sha256:770254a280d741dd3436919d47e35712fb081a6ff8bafc0f319382b954b77802",
-                "sha256:787cc50cab3020a865640aba3485e9fbd161d4d3b0d03a967df1a2881320512d",
-                "sha256:8a07760d5c7f3a92e440ad3aedcc98891e915ce857664282ae3c0220f3301eb6",
-                "sha256:8d3bc3993b8e4be7eade6dcc6fd59a412d96d3a33fa42b0fa45dc9e24495ede9",
-                "sha256:9508a7c628a165c2c835f2497837bf6ac80eb25291055f56c129df3c943cbaf8",
-                "sha256:a144811318853a23d32a07bc7fd5561ff0cac5da643d96ed94a4ffe967d89672",
-                "sha256:a1aae70d52d0b074d8121333bc807a485f9f1e6a69742010b33780df2e60cfe0",
-                "sha256:a2d6df9eb074af7f08866598e4ef068a2b310d98f87dc23bd1b90ec7bdcec802",
-                "sha256:bb517872058a1f087c4528e7429b4a44533a902644987e7b2fe35ecc223bc408",
-                "sha256:c5cac0c0387272ee0e789e94a570ac51deb01c796b37fb2aad1fb13f85e2f97d",
-                "sha256:cc971a82ea1170e677443108703a2ec9ff0f70752258d0e9f5433d00dda01f59",
-                "sha256:dba8306f6da99e37ea08c08fef6e274b5bf8567bb094d1dbe86a20e532aca088",
-                "sha256:dc60bb302f48acf6da8ca4444cfa17d52c63c5415302a9ee77b3b21618090521",
-                "sha256:dee1bbf3a6c8f73b6b218cb28eed8dd13347ea2f87d572ce19b289d6fd3fbc59"
+                "sha256:168c45c0c32e23f613db7c9e4e780bc61982d71dcd406ead746c7c7c2f2004ce",
+                "sha256:213bc59191da2f479984ad4ec39406bf949a99aba70e9237b916ce7547b6ef42",
+                "sha256:25b241034215247481f53355e05f9e25462682b13bd9191359075682adcd9554",
+                "sha256:2c872de0c69ed20fb1a9b9cf6f77298b04a26f0b8720a5457be08be254366c6e",
+                "sha256:3397c129b479846d7eaa18f999369a24322d008fac0782e7828fa567358c36ce",
+                "sha256:368c0f69f93186309e1b4beb8e26d51dd6f5010b79264c0f1e9ca00cd92ea8c9",
+                "sha256:3d5db5d815370c28d938cf9b0809dade4acf7aba57eaf7ef733bfedc9b2474c4",
+                "sha256:4598cf03136067000855d6b44d7a1f4f46994164bcd450fb2c3d481afc25dd06",
+                "sha256:4a453d5e5689de62e5d38edf40af3f17560bfd63c9c5bd228c18c1f99afa155b",
+                "sha256:4f12d13ffbc16e988fa40809cbbd7a8b45bc05ff6ea0ba8e3e41f6f4db3a9e47",
+                "sha256:634568a3018bc16a83cda28d4f7aed0d803dd5618facb36e977e53b2df868443",
+                "sha256:65923bc3809524e46fb7eb4d6346552cbb6a1ffc41be748535aa502a2e3d3389",
+                "sha256:6b0ceb23560f46dd236a8ad4378fc40bad1783e997604ba845e131d6c680963e",
+                "sha256:8c8d6ca19c8497344b810b0b0344f8375af5f6bb9c98bd42e33f747417ab3f57",
+                "sha256:9ad4fcddcbf5dc67619379782e6aeef41218a79e17979aaed01ed099876c0e62",
+                "sha256:a254b98dbcc744c723a838c03b74a8a34c0558c9ac5c86d5561703362231107d",
+                "sha256:b03c4338d6d3d299e8ca494194c0ae4f611548da59e3c038813f1a43976cb437",
+                "sha256:cc1f78ebc982cd0602c9a7615d878396bec94908db67d4ecddca864d049112f2",
+                "sha256:d6d25c41a009e3c6b7e757338948d0076ee1dd1770d1c09ec131f11946883c54",
+                "sha256:d84cadd7d7998433334c99fa55bcba0d8b4aeff0edb123b2a1dfcface538e474",
+                "sha256:e360cb2299028d0b0d0f65a5c5e51fc16a335f1603aa2357c25766c8dab56938",
+                "sha256:e98d49a5717369d8241d6cf33ecb0ca72deee392414118198a8e5b4c35c56340",
+                "sha256:ed572470af2438b526ea574ff8f05e7f39b44ac37f712105e57fc4d53a6fb660",
+                "sha256:f87b39f4d69cf7d7529d7b1098cb712033b17ea7714aed831b95628f483fd012",
+                "sha256:fa789583fc94a7689b45834453fec095245c7e69c58561dc159b5d5277057e4c"
             ],
-            "markers": "python_version >= '3.5'",
-            "version": "==1.4.1"
+            "markers": "python_version >= '3.6'",
+            "version": "==1.5.4"
         },
         "simplejson": {
             "hashes": [
-                "sha256:0fe3994207485efb63d8f10a833ff31236ed27e3b23dadd0bf51c9900313f8f2",
-                "sha256:17163e643dbf125bb552de17c826b0161c68c970335d270e174363d19e7ea882",
-                "sha256:1d1e929cdd15151f3c0b2efe953b3281b2fd5ad5f234f77aca725f28486466f6",
-                "sha256:1d346c2c1d7dd79c118f0cc7ec5a1c4127e0c8ffc83e7b13fc5709ff78c9bb84",
-                "sha256:1ea59f570b9d4916ae5540a9181f9c978e16863383738b69a70363bc5e63c4cb",
-                "sha256:1fbba86098bbfc1f85c5b69dc9a6d009055104354e0d9880bb00b692e30e0078",
-                "sha256:229edb079d5dd81bf12da952d4d825bd68d1241381b37d3acf961b384c9934de",
-                "sha256:22a7acb81968a7c64eba7526af2cf566e7e2ded1cb5c83f0906b17ff1540f866",
-                "sha256:2b4b2b738b3b99819a17feaf118265d0753d5536049ea570b3c43b51c4701e81",
-                "sha256:4cf91aab51b02b3327c9d51897960c554f00891f9b31abd8a2f50fd4a0071ce8",
-                "sha256:4fd5f79590694ebff8dc980708e1c182d41ce1fda599a12189f0ca96bf41ad70",
-                "sha256:5cfd495527f8b85ce21db806567de52d98f5078a8e9427b18e251c68bd573a26",
-                "sha256:60aad424e47c5803276e332b2a861ed7a0d46560e8af53790c4c4fb3420c26c2",
-                "sha256:7739940d68b200877a15a5ff5149e1599737d6dd55e302625650629350466418",
-                "sha256:7cce4bac7e0d66f3a080b80212c2238e063211fe327f98d764c6acbc214497fc",
-                "sha256:8027bd5f1e633eb61b8239994e6fc3aba0346e76294beac22a892eb8faa92ba1",
-                "sha256:86afc5b5cbd42d706efd33f280fec7bd7e2772ef54e3f34cf6b30777cd19a614",
-                "sha256:87d349517b572964350cc1adc5a31b493bbcee284505e81637d0174b2758ba17",
-                "sha256:8de378d589eccbc75941e480b4d5b4db66f22e4232f87543b136b1f093fff342",
-                "sha256:926bcbef9eb60e798eabda9cd0bbcb0fca70d2779aa0aa56845749d973eb7ad5",
-                "sha256:9a126c3a91df5b1403e965ba63b304a50b53d8efc908a8c71545ed72535374a3",
-                "sha256:ad8dd3454d0c65c0f92945ac86f7b9efb67fa2040ba1b0189540e984df904378",
-                "sha256:d140e9376e7f73c1f9e0a8e3836caf5eec57bbafd99259d56979da05a6356388",
-                "sha256:da00675e5e483ead345429d4f1374ab8b949fba4429d60e71ee9d030ced64037",
-                "sha256:daaf4d11db982791be74b23ff4729af2c7da79316de0bebf880fa2d60bcc8c5a",
-                "sha256:f4b64a1031acf33e281fd9052336d6dad4d35eee3404c95431c8c6bc7a9c0588",
-                "sha256:fc046afda0ed8f5295212068266c92991ab1f4a50c6a7144b69364bdee4a0159",
-                "sha256:fc9051d249dd5512e541f20330a74592f7a65b2d62e18122ca89bf71f94db748"
+                "sha256:034550078a11664d77bc1a8364c90bb7eef0e44c2dbb1fd0a4d92e3997088667",
+                "sha256:05b43d568300c1cd43f95ff4bfcff984bc658aa001be91efb3bb21df9d6288d3",
+                "sha256:0dd9d9c738cb008bfc0862c9b8fa6743495c03a0ed543884bf92fb7d30f8d043",
+                "sha256:10fc250c3edea4abc15d930d77274ddb8df4803453dde7ad50c2f5565a18a4bb",
+                "sha256:2862beabfb9097a745a961426fe7daf66e1714151da8bb9a0c430dde3d59c7c0",
+                "sha256:292c2e3f53be314cc59853bd20a35bf1f965f3bc121e007ab6fd526ed412a85d",
+                "sha256:2d3eab2c3fe52007d703a26f71cf649a8c771fcdd949a3ae73041ba6797cfcf8",
+                "sha256:2e7b57c2c146f8e4dadf84977a83f7ee50da17c8861fd7faf694d55e3274784f",
+                "sha256:311f5dc2af07361725033b13cc3d0351de3da8bede3397d45650784c3f21fbcf",
+                "sha256:344e2d920a7f27b4023c087ab539877a1e39ce8e3e90b867e0bfa97829824748",
+                "sha256:3fabde09af43e0cbdee407555383063f8b45bfb52c361bc5da83fcffdb4fd278",
+                "sha256:42b8b8dd0799f78e067e2aaae97e60d58a8f63582939af60abce4c48631a0aa4",
+                "sha256:4b3442249d5e3893b90cb9f72c7d6ce4d2ea144d2c0d9f75b9ae1e5460f3121a",
+                "sha256:55d65f9cc1b733d85ef95ab11f559cce55c7649a2160da2ac7a078534da676c8",
+                "sha256:5c659a0efc80aaaba57fcd878855c8534ecb655a28ac8508885c50648e6e659d",
+                "sha256:72d8a3ffca19a901002d6b068cf746be85747571c6a7ba12cbcf427bfb4ed971",
+                "sha256:75ecc79f26d99222a084fbdd1ce5aad3ac3a8bd535cd9059528452da38b68841",
+                "sha256:76ac9605bf2f6d9b56abf6f9da9047a8782574ad3531c82eae774947ae99cc3f",
+                "sha256:7d276f69bfc8c7ba6c717ba8deaf28f9d3c8450ff0aa8713f5a3280e232be16b",
+                "sha256:7f10f8ba9c1b1430addc7dd385fc322e221559d3ae49b812aebf57470ce8de45",
+                "sha256:8042040af86a494a23c189b5aa0ea9433769cc029707833f261a79c98e3375f9",
+                "sha256:813846738277729d7db71b82176204abc7fdae2f566e2d9fcf874f9b6472e3e6",
+                "sha256:845a14f6deb124a3bcb98a62def067a67462a000e0508f256f9c18eff5847efc",
+                "sha256:869a183c8e44bc03be1b2bbcc9ec4338e37fa8557fc506bf6115887c1d3bb956",
+                "sha256:8acf76443cfb5c949b6e781c154278c059b09ac717d2757a830c869ba000cf8d",
+                "sha256:8f713ea65958ef40049b6c45c40c206ab363db9591ff5a49d89b448933fa5746",
+                "sha256:934115642c8ba9659b402c8bdbdedb48651fb94b576e3b3efd1ccb079609b04a",
+                "sha256:9551f23e09300a9a528f7af20e35c9f79686d46d646152a0c8fc41d2d074d9b0",
+                "sha256:9a2b7543559f8a1c9ed72724b549d8cc3515da7daf3e79813a15bdc4a769de25",
+                "sha256:a55c76254d7cf8d4494bc508e7abb993a82a192d0db4552421e5139235604625",
+                "sha256:ad8f41c2357b73bc9e8606d2fa226233bf4d55d85a8982ecdfd55823a6959995",
+                "sha256:af4868da7dd53296cd7630687161d53a7ebe2e63814234631445697bd7c29f46",
+                "sha256:afebfc3dd3520d37056f641969ce320b071bc7a0800639c71877b90d053e087f",
+                "sha256:b59aa298137ca74a744c1e6e22cfc0bf9dca3a2f41f51bc92eb05695155d905a",
+                "sha256:bc00d1210567a4cdd215ac6e17dc00cb9893ee521cee701adfd0fa43f7c73139",
+                "sha256:c1cb29b1fced01f97e6d5631c3edc2dadb424d1f4421dad079cb13fc97acb42f",
+                "sha256:c94dc64b1a389a416fc4218cd4799aa3756f25940cae33530a4f7f2f54f166da",
+                "sha256:ceaa28a5bce8a46a130cd223e895080e258a88d51bf6e8de2fc54a6ef7e38c34",
+                "sha256:cff6453e25204d3369c47b97dd34783ca820611bd334779d22192da23784194b",
+                "sha256:d0b64409df09edb4c365d95004775c988259efe9be39697d7315c42b7a5e7e94",
+                "sha256:d4813b30cb62d3b63ccc60dd12f2121780c7a3068db692daeb90f989877aaf04",
+                "sha256:da3c55cdc66cfc3fffb607db49a42448785ea2732f055ac1549b69dcb392663b",
+                "sha256:e058c7656c44fb494a11443191e381355388443d543f6fc1a245d5d238544396",
+                "sha256:fed0f22bf1313ff79c7fc318f7199d6c2f96d4de3234b2f12a1eab350e597c06",
+                "sha256:ffd4e4877a78c84d693e491b223385e0271278f5f4e1476a4962dca6824ecfeb"
             ],
             "index": "pypi",
-            "version": "==3.17.0"
+            "version": "==3.17.2"
         },
         "six": {
             "hashes": [
-                "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a",
-                "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c"
+                "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259",
+                "sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced"
             ],
             "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==1.14.0"
+            "version": "==1.15.0"
         },
         "sklearn": {
             "hashes": [
@@ -333,50 +467,120 @@
         },
         "sqlalchemy": {
             "hashes": [
-                "sha256:083e383a1dca8384d0ea6378bd182d83c600ed4ff4ec8247d3b2442cf70db1ad",
-                "sha256:0a690a6486658d03cc6a73536d46e796b6570ac1f8a7ec133f9e28c448b69828",
-                "sha256:114b6ace30001f056e944cebd46daef38fdb41ebb98f5e5940241a03ed6cad43",
-                "sha256:128f6179325f7597a46403dde0bf148478f868df44841348dfc8d158e00db1f9",
-                "sha256:13d48cd8b925b6893a4e59b2dfb3e59a5204fd8c98289aad353af78bd214db49",
-                "sha256:211a1ce7e825f7142121144bac76f53ac28b12172716a710f4bf3eab477e730b",
-                "sha256:2dc57ee80b76813759cccd1a7affedf9c4dbe5b065a91fb6092c9d8151d66078",
-                "sha256:3e625e283eecc15aee5b1ef77203bfb542563fa4a9aa622c7643c7b55438ff49",
-                "sha256:43078c7ec0457387c79b8d52fff90a7ad352ca4c7aa841c366238c3e2cf52fdf",
-                "sha256:5b1bf3c2c2dca738235ce08079783ef04f1a7fc5b21cf24adaae77f2da4e73c3",
-                "sha256:6056b671aeda3fc451382e52ab8a753c0d5f66ef2a5ccc8fa5ba7abd20988b4d",
-                "sha256:68d78cf4a9dfade2e6cf57c4be19f7b82ed66e67dacf93b32bb390c9bed12749",
-                "sha256:7025c639ce7e170db845e94006cf5f404e243e6fc00d6c86fa19e8ad8d411880",
-                "sha256:7224e126c00b8178dfd227bc337ba5e754b197a3867d33b9f30dc0208f773d70",
-                "sha256:7d98e0785c4cd7ae30b4a451416db71f5724a1839025544b4edbd92e00b91f0f",
-                "sha256:8d8c21e9d4efef01351bf28513648ceb988031be4159745a7ad1b3e28c8ff68a",
-                "sha256:bbb545da054e6297242a1bb1ba88e7a8ffb679f518258d66798ec712b82e4e07",
-                "sha256:d00b393f05dbd4ecd65c989b7f5a81110eae4baea7a6a4cdd94c20a908d1456e",
-                "sha256:e18752cecaef61031252ca72031d4d6247b3212ebb84748fc5d1a0d2029c23ea"
+                "sha256:009e8388d4d551a2107632921320886650b46332f61dc935e70c8bcf37d8e0d6",
+                "sha256:0157c269701d88f5faf1fa0e4560e4d814f210c01a5b55df3cab95e9346a8bcc",
+                "sha256:0a92745bb1ebbcb3985ed7bda379b94627f0edbc6c82e9e4bac4fb5647ae609a",
+                "sha256:0cca1844ba870e81c03633a99aa3dc62256fb96323431a5dec7d4e503c26372d",
+                "sha256:166917a729b9226decff29416f212c516227c2eb8a9c9f920d69ced24e30109f",
+                "sha256:1f5f369202912be72fdf9a8f25067a5ece31a2b38507bb869306f173336348da",
+                "sha256:2909dffe5c9a615b7e6c92d1ac2d31e3026dc436440a4f750f4749d114d88ceb",
+                "sha256:2b5dafed97f778e9901b79cc01b88d39c605e0545b4541f2551a2fd785adc15b",
+                "sha256:2e9bd5b23bba8ae8ce4219c9333974ff5e103c857d9ff0e4b73dc4cb244c7d86",
+                "sha256:3aa6d45e149a16aa1f0c46816397e12313d5e37f22205c26e06975e150ffcf2a",
+                "sha256:4bdbdb8ca577c6c366d15791747c1de6ab14529115a2eb52774240c412a7b403",
+                "sha256:53fd857c6c8ffc0aa6a5a3a2619f6a74247e42ec9e46b836a8ffa4abe7aab327",
+                "sha256:5cdfe54c1e37279dc70d92815464b77cd8ee30725adc9350f06074f91dbfeed2",
+                "sha256:5d92c18458a4aa27497a986038d5d797b5279268a2de303cd00910658e8d149c",
+                "sha256:632b32183c0cb0053194a4085c304bc2320e5299f77e3024556fa2aa395c2a8b",
+                "sha256:7c735c7a6db8ee9554a3935e741cf288f7dcbe8706320251eb38c412e6a4281d",
+                "sha256:7cd40cb4bc50d9e87b3540b23df6e6b24821ba7e1f305c1492b0806c33dbdbec",
+                "sha256:84f0ac4a09971536b38cc5d515d6add7926a7e13baa25135a1dbb6afa351a376",
+                "sha256:8dcbf377529a9af167cbfc5b8acec0fadd7c2357fc282a1494c222d3abfc9629",
+                "sha256:950f0e17ffba7a7ceb0dd056567bc5ade22a11a75920b0e8298865dc28c0eff6",
+                "sha256:9e379674728f43a0cd95c423ac0e95262500f9bfd81d33b999daa8ea1756d162",
+                "sha256:b15002b9788ffe84e42baffc334739d3b68008a973d65fad0a410ca5d0531980",
+                "sha256:b6f036ecc017ec2e2cc2a40615b41850dc7aaaea6a932628c0afc73ab98ba3fb",
+                "sha256:bad73f9888d30f9e1d57ac8829f8a12091bdee4949b91db279569774a866a18e",
+                "sha256:bbc58fca72ce45a64bb02b87f73df58e29848b693869e58bd890b2ddbb42d83b",
+                "sha256:bca4d367a725694dae3dfdc86cf1d1622b9f414e70bd19651f5ac4fb3aa96d61",
+                "sha256:be41d5de7a8e241864189b7530ca4aaf56a5204332caa70555c2d96379e18079",
+                "sha256:bf53d8dddfc3e53a5bda65f7f4aa40fae306843641e3e8e701c18a5609471edf",
+                "sha256:c092fe282de83d48e64d306b4bce03114859cdbfe19bf8a978a78a0d44ddadb1",
+                "sha256:c3ab23ee9674336654bf9cac30eb75ac6acb9150dc4b1391bec533a7a4126471",
+                "sha256:ce64a44c867d128ab8e675f587aae7f61bd2db836a3c4ba522d884cd7c298a77",
+                "sha256:d05cef4a164b44ffda58200efcb22355350979e000828479971ebca49b82ddb1",
+                "sha256:d2f25c7f410338d31666d7ddedfa67570900e248b940d186b48461bd4e5569a1",
+                "sha256:d3b709d64b5cf064972b3763b47139e4a0dc4ae28a36437757f7663f67b99710",
+                "sha256:e32e3455db14602b6117f0f422f46bc297a3853ae2c322ecd1e2c4c04daf6ed5",
+                "sha256:ed53209b5f0f383acb49a927179fa51a6e2259878e164273ebc6815f3a752465",
+                "sha256:f605f348f4e6a2ba00acb3399c71d213b92f27f2383fc4abebf7a37368c12142",
+                "sha256:fcdb3755a7c355bc29df1b5e6fb8226d5c8b90551d202d69d0076a8a5649d68b"
             ],
             "index": "pypi",
-            "version": "==1.3.16"
+            "version": "==1.3.20"
         },
         "sqlalchemy-utils": {
             "hashes": [
-                "sha256:f268af5bc03597fe7690d60df3e5f1193254a83e07e4686f720f61587ec4493a"
+                "sha256:fb66e9956e41340011b70b80f898fde6064ec1817af77199ee21ace71d7d6ab0"
             ],
-            "version": "==0.36.3"
+            "version": "==0.36.8"
         },
         "sqlparse": {
             "hashes": [
-                "sha256:022fb9c87b524d1f7862b3037e541f68597a730a8843245c349fc93e1643dc4e",
-                "sha256:e162203737712307dfe78860cc56c8da8a852ab2ee33750e33aeadf38d12c548"
+                "sha256:017cde379adbd6a1f15a61873f43e8274179378e95ef3fede90b5aa64d304ed0",
+                "sha256:0f91fd2e829c44362cbcfab3e9ae12e22badaa8a29ad5ff599f9ec109f0454e8"
             ],
             "index": "pypi",
-            "version": "==0.3.1"
+            "version": "==0.4.1"
+        },
+        "threadpoolctl": {
+            "hashes": [
+                "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725",
+                "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b"
+            ],
+            "markers": "python_version >= '3.5'",
+            "version": "==2.1.0"
         },
         "tqdm": {
             "hashes": [
-                "sha256:00339634a22c10a7a22476ee946bbde2dbe48d042ded784e4d88e0236eca5d81",
-                "sha256:ea9e3fd6bd9a37e8783d75bfc4c1faf3c6813da6bd1c3e776488b41ec683af94"
+                "sha256:9ad44aaf0fc3697c06f6e05c7cf025dd66bc7bcb7613c66d85f4464c47ac8fad",
+                "sha256:ef54779f1c09f346b2b5a8e5c61f96fbcb639929e640e59f8cf810794f406432"
             ],
             "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
-            "version": "==4.45.0"
+            "version": "==4.51.0"
+        },
+        "typed-ast": {
+            "hashes": [
+                "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355",
+                "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919",
+                "sha256:0d8110d78a5736e16e26213114a38ca35cb15b6515d535413b090bd50951556d",
+                "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa",
+                "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652",
+                "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75",
+                "sha256:3742b32cf1c6ef124d57f95be609c473d7ec4c14d0090e5a5e05a15269fb4d0c",
+                "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01",
+                "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d",
+                "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1",
+                "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907",
+                "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c",
+                "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3",
+                "sha256:7e4c9d7658aaa1fc80018593abdf8598bf91325af6af5cce4ce7c73bc45ea53d",
+                "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b",
+                "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614",
+                "sha256:92c325624e304ebf0e025d1224b77dd4e6393f18aab8d829b5b7e04afe9b7a2c",
+                "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb",
+                "sha256:b52ccf7cfe4ce2a1064b18594381bccf4179c2ecf7f513134ec2f993dd4ab395",
+                "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b",
+                "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41",
+                "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6",
+                "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34",
+                "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe",
+                "sha256:d648b8e3bf2fe648745c8ffcee3db3ff903d0817a01a12dd6a6ea7a8f4889072",
+                "sha256:f208eb7aff048f6bea9586e61af041ddf7f9ade7caed625742af423f6bae3298",
+                "sha256:fac11badff8313e23717f3dada86a15389d0708275bddf766cca67a84ead3e91",
+                "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4",
+                "sha256:fcf135e17cc74dbfbc05894ebca928ffeb23d9790b3167a674921db19082401f",
+                "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7"
+            ],
+            "version": "==1.4.1"
+        },
+        "typing-extensions": {
+            "hashes": [
+                "sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918",
+                "sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c",
+                "sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f"
+            ],
+            "version": "==3.7.4.3"
         }
     },
     "develop": {}

+ 1 - 2
cdplib/db_handlers/SQLHandler.py

@@ -508,7 +508,6 @@ class SQLHandler:
         :rtype: DataFrame
         '''
         try:
-            
             connection = self._engine.connect()
 
             data = pd.read_sql(sql=query,
@@ -516,7 +515,7 @@ class SQLHandler:
                                **read_sql_kwargs)
 
             connection.close()
-           
+
             return data
 
         except Exception as e:

+ 173 - 0
cdplib/fine_tuning/FineTunedClassiferCV.py

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Apr 23 08:51:53 2020
+
+@author: tanya
+
+@description: class for fine-tuning a sklearn classifier
+(optimizing the probability threshold)
+"""
+
+import pandas as pd
+import numpy as np
+
+from typing import Callable
+
+from sklearn.base import (BaseEstimator, ClassifierMixin,
+                          clone, MetaEstimatorMixin)
+
+from cdplib.log import Log
+
+from cdplib.utils.TyperConverter import TypeConverter
+
+
class FineTunedClassifierCV(BaseEstimator, ClassifierMixin,
                            MetaEstimatorMixin):
    """
    Probability-threshold tuning for a given estimator.

    Overrides the ``predict`` method of the given sklearn classifier and
    returns hard class predictions obtained with the cross-validated
    optimal probability threshold instead of sklearn's implicit 0.5.

    An object of this class can be passed to an sklearn Pipeline.
    """
    def __init__(self, estimator, cost_func: Callable, greater_is_better: bool,
                 cv=None, threshold_step: float = 0.1):
        """
        :param estimator: sklearn-style binary classifier implementing
            fit / predict_proba / get_params / set_params
        :param Callable cost_func: score function ``cost_func(y_true, y_pred)``
            used to rank candidate thresholds
        :param bool greater_is_better: when True the threshold maximizing
            cost_func is selected, otherwise the minimizing one
        :param cv: iterable of ``(train_indices, val_indices)`` pairs;
            must be provided before fit() is called
        :param float threshold_step: spacing of the candidate-threshold
            grid inside (0, 1)
        """
        self.estimator = estimator

        self.is_fitted = False

        self.greater_is_better = greater_is_better

        # fix: the previous version stored the Ellipsis placeholder
        # (`self.cv = ...`) when cv was None, which crashed on iteration
        # in fit(). Store the value as given and validate it in fit().
        self.cv = cv

        self.cost_func = cost_func

        self.threshold_step = threshold_step

        # sklearn's implicit default; overwritten by fit()
        self.optimal_threshold = 0.5

        self._logger = Log("FineTunedClassifyCV")

    def _get_best_threshold(self, y_val: (pd.DataFrame, np.ndarray),
                            proba_pred: (pd.DataFrame, np.ndarray)):
        '''
        Return the threshold from the grid (threshold_step, 1) that gives
        the best cost_func value on the validation fold.

        :param y_val: true labels of the validation fold
        :param proba_pred: predicted positive-class probabilities (1-d)
        '''
        costs = {}

        for t in np.arange(self.threshold_step, 1, self.threshold_step):
            costs[t] = self.cost_func(y_val, (proba_pred >= t).astype(int))

        selector = max if self.greater_is_better else min

        return selector(costs, key=costs.get)

    def fit(self, X: (pd.DataFrame, np.ndarray),
            y: (pd.DataFrame, np.ndarray) = None,
            **fit_args):
        """
        Estimate the optimal probability threshold as the mean of the
        per-fold optima, then refit the wrapped estimator on all data.

        :raises ValueError: if no cv splits were provided

        :return: self (sklearn convention)
        """
        if self.cv is None:
            raise ValueError("A cv object (an iterable of "
                             "(train_inds, val_inds) pairs) is required")

        X = TypeConverter().convert_to_ndarray(X)
        if y is not None:
            # fix: the previous version converted X here instead of y
            y = TypeConverter().convert_to_ndarray(y)

        optimal_thrs_per_fold = []

        for train_inds, val_inds in self.cv:
            X_train, X_val = X[train_inds], X[val_inds]

            if y is not None:
                y_train, y_val = y[train_inds], y[val_inds]
            else:
                y_train, y_val = None, None

            # fix: clone self.estimator (the previous version accidentally
            # referenced the module-level variable `fine_tuned_clf`)
            estimator = clone(self.estimator)

            estimator.fit(X_train, y_train, **fit_args)

            # fix: predict_proba returns one column per class; take the
            # positive-class column so the threshold comparison yields a
            # 1-d label vector for cost_func (binary classification assumed)
            proba_pred = estimator.predict_proba(X_val)[:, 1]

            optimal_thr = self._get_best_threshold(y_val, proba_pred)

            optimal_thrs_per_fold.append(optimal_thr)

        self.optimal_threshold = np.mean(optimal_thrs_per_fold)

        # fix: refit on the full data *with* the labels (they were dropped
        # before) and flag the wrapper as fitted so that predict() works
        self.estimator.fit(X, y, **fit_args)

        self.is_fitted = True

        return self

    def predict(self, X: (pd.DataFrame, np.ndarray)) -> np.ndarray:
        """
        Hard class predictions using the tuned probability threshold.
        Returns None (with a warning) if fit() has not been called.
        """
        if self.is_fitted:

            # positive-class probabilities; binary classification assumed
            proba_pred = self.estimator.predict_proba(X)[:, 1]

            return (proba_pred >= self.optimal_threshold).astype(int)

        else:
            self._logger.warn("You should fit first")

    def get_params(self, deep: bool = True):
        """
        Parameters of the wrapped estimator plus the wrapper-level
        cv and cost_func.

        :param deep: forwarded to the wrapped estimator's get_params
            (added for sklearn API compatibility; the default keeps the
            previous behavior)
        """
        params = self.estimator.get_params(deep=deep)

        params.update({"cv": self.cv, "cost_func": self.cost_func})

        return params

    def set_params(self, **params: dict):
        """
        Set wrapper-level parameters (cv, cost_func) and forward the rest
        to the wrapped estimator.
        """
        # fix: the previous version popped keys while iterating over the
        # dict, which raises RuntimeError on Python 3
        if "cv" in params:
            self.cv = params.pop("cv")

        if "cost_func" in params:
            self.cost_func = params.pop("cost_func")

        self.estimator.set_params(**params)
+
+
if __name__ == "__main__":
    # smoke test: tune the decision threshold of an XGBoost random-forest
    # classifier on a binarized iris problem
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score
    import gc
    from xgboost import XGBRFClassifier

    data = load_iris()
    X, y = data["data"], data["target"]
    y = (y == 1).astype(int)
    del data
    gc.collect()

    # build an expanding-window cv: for each split point, train on all
    # samples before it and validate on the next val_len samples
    val_len = len(X) // 10

    cv = [(list(range(split)), list(range(split, split + val_len)))
          for split in range(len(X) // 2, len(X), val_len)]

    fine_tuned_clf = FineTunedClassifierCV(estimator=XGBRFClassifier(),
                                           cv=cv,
                                           greater_is_better=True,
                                           cost_func=accuracy_score)

    fine_tuned_clf.fit(X=X, y=y)
+

+ 47 - 51
cdplib/gridsearch/GridSearchPipelineSelector.py

@@ -14,16 +14,15 @@ Created on Wed Sep 30 14:15:17 2020
 """
 
 import os
-import sys
 import datetime
+import numpy as np
 from itertools import product
 from collections import ChainMap
 from sklearn.pipeline import Pipeline
+from typing import Callable, Optional, Literal, Dict, Union, List
 
 from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
 
-sys.path.append(os.getcwd())
-
 
 class GridSearchPipelineSelector(PipelineSelector):
     """
@@ -36,17 +35,19 @@ class GridSearchPipelineSelector(PipelineSelector):
      if needed.
     """
     def __init__(self,
-                 cost_func,
+                 cost_func: Union[Callable, str],
                  greater_is_better: bool,
                  trials_path: str,
-                 backup_trials_freq: int = 1,
-                 cross_val_averaging_func: callable = None,
-                 additional_metrics: dict = None,
-                 strategy_name: str = None,
-                 stdout_log_level: str = "INFO"
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"
                  ):
         """
-        :param callable cost_func: function to minimize or maximize
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
 
         :param bool greater_is_better: when True
             cost_func is maximized, else minimized.
@@ -56,25 +57,24 @@ class GridSearchPipelineSelector(PipelineSelector):
             select information about the obtained scores, score variations,
             and pipelines, and parameters tried out so far. If a trials object
             already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
+            search is continued, else, the search is started from scratch.
 
         :param backup_trials_freq: frequency in iterations (trials)
             of saving the trials object at the trials_path.
+            if None, the trials object is backed up every time
+            the score improves.
 
-        :param str log_path: Optional, when not provided logs to stdout.
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
 
-        :param callable averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            Classic situation is to take the mean,
-            another example is, for example mean() - c*var().
         :param additional_metrics: dict of additional metrics to save
             of the form {"metric_name": metric} where metric is a Callable.
 
-        :param str strategy_name: a name might be asigned to the trials,
-            a strategy is defined by the data set, cv object, cost function.
-            When the strategy changes, one should start with new trials.
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
 
         :param str stdout_log_level: can be INFO, WARNING, ERROR
         """
@@ -99,7 +99,7 @@ class GridSearchPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def run_trials(self):
+    def run_trials(self) -> None:
         """
         """
         try:
@@ -115,22 +115,25 @@ class GridSearchPipelineSelector(PipelineSelector):
             # with all different combinations of
             # parameters for different pipelines
             # from the space definition.
-            space_unfolded = ({"name": pipeline_dist["name"],
-                               "pipeline": pipeline_dist["pipeline"],
+            space_unfolded = ({"name": param_dist["name"],
+                               "pipeline": param_dist["pipeline"],
                                "params": param_set}
-                              for pipeline_dist in self._space
+                              for param_dist in self._space
                               for param_set in
                               (dict(ChainMap(*tup)) for tup in
                                product(*[[{k: v} for v in
-                                          pipeline_dist["params"][k]]
-                                         for k in pipeline_dist["params"]])))
+                                          param_dist["params"][k]]
+                                         for k in param_dist["params"]])))
 
             for space_element in space_unfolded:
 
+                # uniquely identifies the current space element
                 trial_id = {"name": space_element["name"],
                             "params": space_element["params"],
                             "status": 'ok'}
 
+                # verify if the current pipline/parameters
+                # were already tested before
                 if trial_id in done_trial_ids:
                     continue
 
@@ -159,15 +162,12 @@ class GridSearchPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def number_of_trials(self) -> int:
+    def number_of_trials(self) -> Union[int, None]:
         """
         Number of trials already run in the current trials object
         """
         try:
-            if self._trials is None:
-                return 0
-            else:
-                return len(self._trials)
+            return len(self._trials)
 
         except Exception as e:
             err = ("Failed to retrieve the number of trials. "
@@ -176,11 +176,11 @@ class GridSearchPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial(self) -> dict:
+    def best_trial(self) -> Union[dict, None]:
         """
         """
         try:
-            assert(self._trials is not None),\
+            assert(len(self._trials) > 0),\
                 ("Trials object is empty. "
                  "Call run_trials method.")
 
@@ -193,11 +193,11 @@ class GridSearchPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial_score(self) -> float:
+    def best_trial_score(self) -> Union[float, None]:
         '''
         '''
         try:
-            assert(self._trials is not None),\
+            assert(len(self._trials) > 0),\
                 ("Trials object is empty. "
                  "Call run_trials method.")
 
@@ -210,11 +210,11 @@ class GridSearchPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial_score_variance(self) -> float:
+    def best_trial_score_variance(self) -> Union[float, None]:
         '''
         '''
         try:
-            assert(self._trials is not None),\
+            assert(len(self._trials) > 0),\
                 ("Trials object is empty. "
                  "Call run_trials method.")
 
@@ -227,11 +227,11 @@ class GridSearchPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial_pipeline(self) -> Pipeline:
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
         '''
         '''
         try:
-            assert(self._trials is not None),\
+            assert(len(self._trials) > 0),\
                 ("Trials object is empty. "
                  "Call run_trials method.")
 
@@ -243,16 +243,14 @@ class GridSearchPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def get_n_best_trial_pipelines(self, n: int) -> list:
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
         """
         N best pipelines with corresponding
         best hyperparameters
         """
         try:
-            assert(isinstance(n, int)),\
-                "Parameter n must be an int"
-
-            assert(self._trials is not None),\
+            assert(len(self._trials) > 0),\
                 ("Trials object is empty. "
                  "Call run_trials method.")
 
@@ -266,17 +264,15 @@ class GridSearchPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> list:
+    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
+            -> Union[Dict[str, List[Pipeline]], None]:
         """
         If the hyperparameter search is done over multiple
         pipelines, then returns n different pipeline-types
         with corresponding hyperparameters
         """
         try:
-            assert(isinstance(n, int)),\
-                "Parameter n must be an int"
-
-            assert(self._trials is not None),\
+            assert(len(self._trials) > 0),\
                 ("Trials object is empty. "
                  "Call run_trials method.")
 
@@ -295,7 +291,7 @@ class GridSearchPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def trials_to_excel(self, path: str):
+    def trials_to_excel(self, path: str) -> None:
         """
         Trials object in the shape of table written to excel,
         should contain the run number, pipeline (as str),

+ 2 - 4
cdplib/hyperopt/HyperoptPipelineSelection.py

@@ -480,8 +480,6 @@ class HyperoptPipelineSelection:
                         trials=self._trials,
                         max_evals=len(self._trials.trials) + niter)
 
-            # print('AAAA', str(niter))
-
             self._logger.info(
                     "Best score is {0} with variance {1}"
                     .format(
@@ -589,8 +587,8 @@ class HyperoptPipelineSelection:
                 losses = [self._ith_trial_loss(i)
                           for i in range(len(self._trials.trials))]
 
-            best_n_indices = [losses.index(l)
-                              for l in sorted(list(set(losses)))[:n]]
+            best_n_indices = [losses.index(ll)
+                              for ll in sorted(list(set(losses)))[:n]]
 
             return [self._ith_trial_pipeline(i) for i in best_n_indices]
         else:

+ 60 - 60
cdplib/hyperopt/HyperoptPipelineSelector.py

@@ -21,8 +21,6 @@ from copy import deepcopy
 
 import datetime
 
-from typing import Callable
-
 import pandas as pd
 import numpy as np
 
@@ -30,7 +28,10 @@ from sklearn.pipeline import Pipeline
 
 from hyperopt import fmin, tpe, rand, Trials, space_eval
 
-from cdplib.pipeline_selector.PipelineSelector import PipelineSelector
+from cdplib.pipeline_selector.PipelineSelector import PipelineSelector,\
+     SpaceElementType
+
+from typing import Callable, Optional, Literal, Dict, Union, List
 
 
 class HyperoptPipelineSelector(PipelineSelector):
@@ -52,16 +53,18 @@ class HyperoptPipelineSelector(PipelineSelector):
     a better pipeline was found.
     """
     def __init__(self,
-                 cost_func: (Callable, str),
+                 cost_func: Union[Callable, str],
                  greater_is_better: bool,
                  trials_path: str,
-                 backup_trials_freq: int = None,
-                 cross_val_averaging_func: Callable = None,
-                 additional_metrics: dict = None,
-                 strategy_name: str = None,
-                 stdout_log_level: str = "INFO"):
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
         """
-        :param callable cost_func: function to minimize or maximize
+        :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
 
         :param bool greater_is_better: when True
             cost_func is maximized, else minimized.
@@ -71,25 +74,24 @@ class HyperoptPipelineSelector(PipelineSelector):
             select information about the obtained scores, score variations,
             and pipelines, and parameters tried out so far. If a trials object
             already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
+            search is continued, else, the search is started from scratch.
 
         :param backup_trials_freq: frequency in iterations (trials)
             of saving the trials object at the trials_path.
+            if None, the trials object is backed up every time
+            the score improves.
 
-        :param str log_path: Optional, when not provided logs to stdout.
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores.
+            Example different from the mean: mean - c*var.
 
-        :param callable averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            Classic situation is to take the mean,
-            another example is, for example mean() - c*var().
         :param additional_metrics: dict of additional metrics to save
             of the form {"metric_name": metric} where metric is a Callable.
 
-        :param str strategy_name: a name might be asigned to the trials,
-            a strategy is defined by the data set, cv object, cost function.
-            When the strategy changes, one should start with new trials.
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
 
         :param str stdout_log_level: can be INFO, WARNING, ERROR
         """
@@ -116,30 +118,19 @@ class HyperoptPipelineSelector(PipelineSelector):
 
     def run_trials(self,
                    niter: int,
-                   algo: callable = tpe.suggest):
+                   algo: Literal[tpe.suggest, rand.suggest] = tpe.suggest)\
+            -> None:
         '''
         Method performing the search of the best pipeline in the given space.
         Calls fmin function from the hyperopt library to minimize the output of
         _objective.
 
         :params int niter: number of search iterations
-        :param callable algo: now can only take values tpe for a tree-based
-            random search or random for random search
+        :param algo: can only take values supported by the hyperopt
+            library. For now these are tpe.suggest for a tree-based
+            bayesian search or rand.suggest for randomized search
         '''
         try:
-            assert(self.attached_space),\
-                ("Space must be attach to be able to "
-                 "retrieve this information.")
-
-            assert(isinstance(niter, int)),\
-                "Parameter 'niter' must be of int type"
-
-            # right now only two algorithms are provided by hyperopt
-            assert(algo in [tpe.suggest, rand.suggest]),\
-                ("Parameter 'algo' can be now only tpe or random. "
-                 "If other algorithms have been developped by "
-                 "by hyperopt, plased add them to the list.")
-
             self._trials = self._trials or Trials()
 
             self._logger.info(("Starting {0} iterations of search "
@@ -171,11 +162,13 @@ class HyperoptPipelineSelector(PipelineSelector):
             self._backup_trials()
 
         except Exception as e:
-            raise ValueError(("Failed to select best "
-                             "pipeline! Exit with error: {}").format(e))
+            err = ("Failed to select best "
+                   "pipeline! Exit with error: {}").format(e)
+
+            self._logger.log_and_raise_error(err)
 
     @property
-    def number_of_trials(self) -> int:
+    def number_of_trials(self) -> Union[int, None]:
         """
         :return: number of trials run so far
             with the given Trials object
@@ -187,9 +180,11 @@ class HyperoptPipelineSelector(PipelineSelector):
         except Exception as e:
             err = ("Failed to retrieve the number of trials. "
                    "Exit with error {}".format(e))
+
             self._logger.log_and_raise_error(err)
 
-    def _get_space_element_from_trial(self, trial) -> dict:
+    def _get_space_element_from_trial(self, trial: Dict)\
+            -> Union[Dict[SpaceElementType], None]:
         """
         Hyperopt trials object does not contain the space
              elements that result in the corresponding trials.
@@ -224,7 +219,8 @@ class HyperoptPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def _get_space_element_from_index(self, i: int) -> dict:
+    def _get_space_element_from_index(self, i: int)\
+            -> Union[Dict[SpaceElementType], None]:
         """
         Gets the space element of shape
         {"name": NAME, "params": PARAMS, "pipeline": PIPELINE}
@@ -243,7 +239,7 @@ class HyperoptPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def _get_pipeline_from_index(self, i: int) -> Pipeline:
+    def _get_pipeline_from_index(self, i: int) -> Union[Pipeline, None]:
         """
         Gets a pipeline with set parameters from the trial number i
         """
@@ -259,16 +255,19 @@ class HyperoptPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial(self) -> dict:
+    def best_trial(self) -> Union[Dict, None]:
         """
         :return: dictionary with the summary of the best trial
             and space element (name, pipeline, params)
             resulting in the best trial
         """
         if len(self._trials.trials) == 0:
+
             self._logger.log_and_throw_warning("Trials object is empty")
             return {}
+
         else:
+
             try:
                 best_trial = deepcopy(self._trials.best_trial)
 
@@ -297,7 +296,7 @@ class HyperoptPipelineSelector(PipelineSelector):
                 self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial_score(self) -> float:
+    def best_trial_score(self) -> Union[float, None]:
         """
         """
         try:
@@ -313,7 +312,7 @@ class HyperoptPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial_score_variance(self) -> float:
+    def best_trial_score_variance(self) -> Union[float, None]:
         """
         """
         try:
@@ -329,7 +328,7 @@ class HyperoptPipelineSelector(PipelineSelector):
             self._logger.log_and_raise_error(err)
 
     @property
-    def best_trial_pipeline(self) -> Pipeline:
+    def best_trial_pipeline(self) -> Union[Pipeline, None]:
         """
         """
         try:
@@ -344,15 +343,13 @@ class HyperoptPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def get_n_best_trial_pipelines(self, n: int) -> list:
+    def get_n_best_trial_pipelines(self, n: int)\
+            -> Union[List[Pipeline], None]:
         """
         :return: the list of n best pipelines
         documented in trials
         """
         try:
-            assert(isinstance(n, int)),\
-                "Parameter n must be an int"
-
             if len(self._trials.trials) == 0:
                 return []
             else:
@@ -369,15 +366,13 @@ class HyperoptPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def get_n_best_trial_pipelines_of_each_type(self, n: int) -> dict:
+    def get_n_best_trial_pipelines_of_each_type(self, n: int)\
+            -> Union[Dict[str, List[Pipeline]], None]:
         """
         :return: a dictionary where keys are pipeline names,
         and values are lists of best pipelines with this name
         """
         try:
-            assert(isinstance(n, int)),\
-                "Parameter n must be an int"
-
             scores = [trial["result"]["score"]
                       for trial in self._trials.trials]
 
@@ -401,7 +396,7 @@ class HyperoptPipelineSelector(PipelineSelector):
 
             self._logger.log_and_raise_error(err)
 
-    def trials_to_excel(self, path: str = None):
+    def trials_to_excel(self, path: str = None) -> None:
         """
         Saves an excel file with pipeline names, scores,
         parameters, and timestamps.
@@ -431,8 +426,8 @@ if __name__ == '__main__':
     from sklearn.datasets import load_breast_cancer
     from cdplib.log import Log
     from cdplib.db_handlers import MongodbHandler
-    # from cdplib.hyperopt.space_sample import space
-    from cdplib.hyperopt.composed_space_sample import space
+    from cdplib.hyperopt.space_sample import space
+    # from cdplib.hyperopt.composed_space_sample import space
 
     trials_path = "hyperopt_trials_TEST.pkl"
     additional_metrics = {"precision": precision_score}
@@ -472,9 +467,14 @@ if __name__ == '__main__':
 
     try:
 
+        # TODO: this line causes a pytype to throw not-callable error
+        # works fine with pytype on other class methods.
         save_method = MongodbHandler().insert_data_into_collection
         save_kwargs = {'collection_name': collection_name}
 
+        # save_method = pd.DataFrame.to_excel()
+        # save_kwargs = {'excel_writer': "TEST.xlsx"}
+
         hs.configer_summary_saving(save_method=save_method,
                                    kwargs=save_kwargs)
 
@@ -482,8 +482,8 @@ if __name__ == '__main__':
 
     except Exception as e:
 
-        logger.warn(("Could not configure summary saving in mongo. "
-                     "Exit with error: {}".format(e)))
+        logger.warning(("Could not configure summary saving in mongo. "
+                        "Exit with error: {}".format(e)))
 
     hs.run_trials(niter=10)
 

+ 208 - 0
cdplib/ml_validation/CVComposer.py

@@ -0,0 +1,208 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 10:27:39 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List, NewType
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile, chain
+
+from cdplib.log import Log
+
+
# Type aliases for cv objects and data sets used throughout this module.
CVType = NewType("CVType", Iterable[Tuple[List]])

# NOTE: "pd.Sereis" was a typo that raised AttributeError at import time.
DataSetType = NewType("DataSetType",
                      Union[pd.DataFrame, pd.Series, np.ndarray, List])
+
+
class CVComposer:
    """
    Groups methods for composing cv objects
    that follow standards from sklearn
    (iterables of (train_index, test_index) pairs);
    these cv objects can be passed to algorithms like gridsearch, etc.
    """
    def __init__(self):
        """
        """
        self._logger = Log("CVComposer: ")

    def dummy_cv(
            self,
            train_set_size: Union[int, None] = None,
            train_index: Union[pd.Series, np.ndarray, None] = None,
            test_set_size: Union[int, None] = None,
            test_index: DataSetType = None) -> CVType:
        """
        One-fold cv object. Exactly one of train_set_size/train_index
        must be given, and exactly one of test_set_size/test_index.
        When sizes are given, the indices are consecutive ranges:
        train [0, train_set_size), test following it.
        """
        assert((train_index is None) != (train_set_size is None)),\
            "Set train_index or train_set_size"

        # the original message said "train" here — copy-paste slip
        assert((test_index is None) != (test_set_size is None)),\
            "Set test_index or test_set_size"

        train_index = train_index if (train_index is not None)\
            else list(range(train_set_size))

        test_index = test_index if (test_index is not None)\
            else list(range(train_set_size, train_set_size + test_set_size))

        return [(train_index, test_index)]

    def dummy_cv_and_concatenated_data_set(
            self,
            X_train: DataSetType,
            y_train: Union[DataSetType, None] = None,
            X_test: Union[DataSetType, None] = None,
            y_test: Union[DataSetType, None] = None)\
            -> Tuple[CVType, DataSetType, Union[DataSetType, None]]:
        """
        Concatenates the train and the test sets and builds a one-fold
        cv object that reproduces the original train/test split.

        :return: (cv, X, y); y is None when no targets were given.
        """
        # X_test is logically required; it carries a default only because
        # it follows the defaulted y_train (the original signature was a
        # SyntaxError: non-default argument after a default one).
        assert(X_test is not None), "X_test must be provided"

        assert((y_test is None) == (y_train is None))

        # keep the pandas indices only when they do not overlap;
        # the original used `and` where a set intersection was intended
        use_index = (isinstance(X_train, pd.DataFrame) and
                     isinstance(X_test, pd.DataFrame) and
                     (len(set(X_train.index) & set(X_test.index)) == 0))

        if use_index:

            cv = self.dummy_cv(train_index=X_train.index,
                               test_index=X_test.index)

            X = pd.concat([X_train, X_test], ignore_index=False, axis=0)

        else:
            # dummy_cv has no train_size/test_size parameters;
            # the original call used those wrong keyword names
            cv = self.dummy_cv(train_set_size=len(X_train),
                               test_set_size=len(X_test))

            X = np.concatenate([X_train, X_test])

        use_target_index = use_index and (
                    isinstance(y_train, pd.Series) and
                    isinstance(y_test, pd.Series) and
                    (X_train.index.equals(y_train.index)) and
                    (X_test.index.equals(y_test.index)))

        if use_target_index:

            y = pd.concat([y_train, y_test], ignore_index=False, axis=0)

        else:

            y = np.concatenate([y_train, y_test]) if (y_train is not None)\
                else None

        result_to_np = (
            (isinstance(X_train, pd.DataFrame) !=
             isinstance(X_test, pd.DataFrame)) or
            (isinstance(X_train, pd.DataFrame)) and
            (len(set(X_train.index) & set(X_test.index)) != 0))

        if result_to_np:
            self._logger.log_and_throw_warning(
                    "The concatenated dataframe is converted to numpy")

        return cv, X, y

    def expanding_cv(self, test_proportion: float,
                     start_train_proportion: float,
                     step_proportion: float = None,
                     expanding_test_size: bool = False,
                     data_set_size: Union[float, None] = None,
                     index: Union[pd.Series, np.ndarray, list, None] = None)\
            -> Union[Iterable[Tuple[List]], None]:
        """
        Generator of (train_index, test_index) pairs where the train
        window always starts at the beginning of the data and grows by
        step_proportion of the data set per fold.
        Exactly one of index/data_set_size must be given.
        """
        try:
            assert((index is None) != (data_set_size is None)),\
                "Set index or data_set_size"

            index = pd.Series(index) if (index is not None)\
                else pd.Series(range(data_set_size))

            data_set_size = data_set_size or len(index)

            start_train_size = int(start_train_proportion * data_set_size)
            # NOTE(review): step_proportion has no usable default —
            # leaving it None raises TypeError here; confirm the
            # intended default with the author.
            step_size = int(step_proportion * data_set_size)

            test_size = int(test_proportion * data_set_size)

            # train sizes: start_train_size, +step_size, ... while a
            # test window of test_size still fits after the train window
            train_inds_set = (list(range(train_size))
                              for train_size in
                              takewhile(
                                      lambda x: x <= data_set_size - test_size,
                                      accumulate(repeat(start_train_size),
                                                 lambda x, _: x + step_size)))

            for train_inds in train_inds_set:

                if expanding_test_size:

                    # the test window grows proportionally to the
                    # current train window
                    yield (index[train_inds],
                           index[train_inds[-1] + 1:
                                 train_inds[-1] + 1
                                 + int(test_proportion*len(train_inds))])

                else:

                    yield (index[train_inds],
                           index[train_inds[-1] + 1:
                                 train_inds[-1] + 1 + test_size])

        except Exception as e:
            self._logger.log_and_raise_error(("Failed to make expanding cv. "
                                              "Exit with error: {}".format(e)))

    def sliding_window_cv(
        self,
        test_proportion: float,
        train_proportion: float,
        step_proportion: float = None,
        data_set_size: Union[float, None] = None,
        index: Union[pd.Series, np.ndarray, list, None] = None)\
            -> Union[Iterable[Tuple[List]], None]:
        """
        Cv object of (train_index, test_index) pairs where a train
        window of fixed size slides forward by step_proportion of the
        data set per fold. Exactly one of index/data_set_size
        must be given.
        """
        try:
            assert((index is None) != (data_set_size is None)),\
                "Set index or data_set_size"

            index = pd.Series(index) if (index is not None)\
                else pd.Series(range(data_set_size))

            data_set_size = data_set_size or len(index)

            train_size = int(train_proportion * data_set_size)
            test_size = int(test_proportion * data_set_size)
            # NOTE(review): same missing-default issue as in expanding_cv
            step_size = int(step_proportion * data_set_size)

            # end positions of successive train windows
            train_sizes = takewhile(lambda x: x <= data_set_size - test_size,
                                    accumulate(repeat(train_size),
                                               lambda x, _: x + step_size))

            # start positions of successive train windows
            train_starts = takewhile(lambda x: x <= data_set_size
                                     - train_size - test_size,
                                     accumulate(repeat(step_size),
                                                lambda x, _: x + step_size))

            train_starts = chain([0], train_starts)

            train_inds_set = list(range(train_start, train_size)
                                  for train_start, train_size in
                                  zip(train_starts, train_sizes))

            cv = ((index[train_inds], index[train_inds[-1] + 1:
                                            train_inds[-1] + 1 + test_size])
                  for train_inds in train_inds_set)

            return cv

        except Exception as e:
            self._logger.log_and_raise_error(
                    ("Failed to make sliding window cv. "
                     "Exit with error: {}".format(e)))
+

+ 0 - 0
cdplib/ml_validation/__init__.py


+ 491 - 0
cdplib/ml_validation/cross_validate_with_fine_tuning.py

@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Oct 29 13:58:23 2020
+
+@author: tanya
+
+
+@description:
+
+* Input:
+    - pipeline/hyperparameter space
+    - data_train
+    - cv
+    - cv_folds
+
+* For each pipeline:
+
+    -> Split data_train into folds according to cv
+
+     -> For each fold:
+
+         => get data_train_fold, data_test_fold, cv_fold
+
+         => split data_train_fold into subfolds according to cv_fold
+
+         => For each subfold:
+
+             ==> get data_train_subfold, data_test_subfold
+
+             ==> train pipeline on data_train_subfold
+
+             ==> find best_threshold_subfold on data_test_subfold
+
+        => Find averaged_threshold_fold averaged over best_threshold_subfold
+
+        => train pipeline on data_train_fold
+
+        => find score_fold on data_test_fold with proba_threshold_fold
+
+        => find best_threshold_fold on data_test_fold
+
+    -> find score averaged over score_fold
+
+    -> find averaged_threshold averaged over best_threshold_fold
+
+* choose (pipeline/hyperparameters, threshold) in the space with best score
+
+"""
+
# standard library
from copy import deepcopy
from itertools import zip_longest
from itertools import accumulate, repeat, takewhile, chain
from typing import Union, Callable, Dict, Iterable, Tuple, List

# third party
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold

# project
from cdplib.log import Log
from cdplib.ml_validation.expanding_cv import make_expanding_cv
+
+
+
+
+
# NOTE(review): module-level smoke-test leftovers committed by accident.
# `make_sliding_window_cv` is neither defined nor imported in this module,
# so these statements raise NameError at import time -- confirm and remove
# them or move them under an `if __name__ == "__main__":` guard.
aa = make_sliding_window_cv(data_set_size=50,
                            test_proportion=0.1,
                            train_proportion=0.6,
                            step_proportion=0.1)

aa = list(aa)

# same CV, driven by an explicit datetime index instead of a set size
aa = make_sliding_window_cv(test_proportion=0.1,
                            train_proportion=0.6,
                            step_proportion=0.05,
                            index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))

aa = list(aa)


# TODO: write with yield !!!!
+
def make_nested_expanding_cv(
        test_proportion: float,
        start_train_proportion: float,
        step_proportion: float = None,
        expanding_test_size: bool = False,
        data_set_size: Union[float, None] = None,
        index: Union[pd.Series, np.ndarray, list, None] = None)\
        -> Iterable[Tuple[List]]:
    """
    Build a nested expanding-window cross-validation: an outer expanding
    CV over the whole data set, plus one inner expanding CV (using the
    same proportions) inside each outer training fold.

    Exactly one of `data_set_size` / `index` must be given; this is
    asserted inside `make_expanding_cv`.

    :param test_proportion: test-fold size as a fraction of the set size.
    :param start_train_proportion: fraction used by the first train fold.
    :param step_proportion: per-step growth of the train fold.
    :param expanding_test_size: when True, the test fold grows together
        with the train fold (see `make_expanding_cv`).
    :param data_set_size: number of samples (used when `index` is None).
    :param index: explicit index to slice (used when `data_set_size` is
        None).
    :return: list with one inner CV per outer fold; each inner CV is a
        list of (train_index, test_index) pairs.
        On failure the error is logged and re-raised by the logger.
    """
    logger = Log("make_nested_expanding_cv:")

    try:
        # outer split over the full data set
        cv = make_expanding_cv(test_proportion=test_proportion,
                               start_train_proportion=start_train_proportion,
                               step_proportion=step_proportion,
                               expanding_test_size=expanding_test_size,
                               data_set_size=data_set_size,
                               index=index)

        nested_cv = []

        for train_inds, test_inds in cv:

            # the inner CV runs over the outer training fold only:
            # pass its index when the caller supplied an index,
            # otherwise just its size
            fold_index = train_inds if index is not None\
                else None

            fold_size = len(train_inds) if index is None else None

            fold_cv = make_expanding_cv(
                    test_proportion=test_proportion,
                    start_train_proportion=start_train_proportion,
                    step_proportion=step_proportion,
                    expanding_test_size=expanding_test_size,
                    data_set_size=fold_size,
                    index=fold_index)

            # materialize the inner generator so each fold can be
            # iterated independently by the caller
            nested_cv.append(list(fold_cv))

        return nested_cv

    except Exception as e:
        logger.log_and_raise_error(("Failed to make nested expanding cv. "
                                    "Exit with error: {}".format(e)))
+
+
+
+
# NOTE(review): more module-level smoke-test leftovers. `aa` refers to the
# sliding-window CV built above (itself broken -- see the note there), and
# `aaa = list(aaa)` below executes before `aaa` is first assigned at the
# make_nested_expanding_cv call, raising NameError -- confirm and clean up.
for train_inds, test_inds in aa:
    print(len(test_inds)/(len(train_inds) + len(test_inds)))
    print(len(test_inds)/50)

aaa = list(aaa)

for aaa_cv in aaa:
    for train_inds, test_inds in aaa_cv:
        print(len(test_inds)/(len(train_inds) + len(test_inds)))
        print(len(test_inds)/50)

# nested expanding CV driven by a datetime index
aaa = make_nested_expanding_cv(#data_set_size=50,
                               test_proportion=0.1,
                               start_train_proportion=0.6,
                               step_proportion=0.1,
                               index=pd.date_range(start=pd.to_datetime("2020-01-01"), periods=50))

aaa = list(aaa)
+
+
+
def cv_slice_dataset(X, y, train_inds, test_inds)\
        -> Tuple[Union[pd.DataFrame, np.ndarray],
                 Union[pd.DataFrame, np.ndarray],
                 Union[pd.Series, np.ndarray, None],
                 Union[pd.Series, np.ndarray, None]]:
    """
    Slice features and targets into a train and a validation part.

    :param X: feature matrix; a DataFrame is sliced by index labels
        (.loc), any other array-like is sliced positionally.
    :param y: target vector, or None in the unsupervised case.
    :param train_inds: indices selecting the training fold.
    :param test_inds: indices selecting the validation fold.
    :return: (X_train, X_val, y_train, y_val); y_train and y_val are
        None when y is None.
    """
    if isinstance(X, pd.DataFrame):
        X_train = X.loc[train_inds]
        X_val = X.loc[test_inds]
    else:
        X_train = X[train_inds]
        X_val = X[test_inds]

    # bug fix: y_train/y_val were previously unbound (NameError on the
    # return statement) whenever y was None; default them explicitly
    if y is not None:
        y_train = y[train_inds]
        y_val = y[test_inds]
    else:
        y_train = None
        y_val = None

    return X_train, X_val, y_train, y_val
+
+
def get_optimal_proba_threshold(score_func: Callable,
                                y_true: Union[pd.Series, np.ndarray],
                                proba: Union[pd.Series, np.ndarray],
                                threshold_set: Union[Iterable, None] = None):
    """
    Return the probability cut-off from threshold_set that maximizes
    score_func(y_true, proba >= cut_off).

    :param score_func: metric of the form score_func(y_true, y_pred).
    :param y_true: ground-truth binary labels.
    :param proba: predicted probabilities of the positive class.
    :param threshold_set: candidate cut-offs; defaults to the grid
        0.0, 0.1, ..., 0.9. Ties are resolved in favour of the
        earliest candidate.
    """
    if threshold_set is None:
        threshold_set = np.arange(0, 1, 0.1)

    # score every candidate cut-off; insertion order preserves the
    # original first-wins tie-breaking of max()
    candidate_scores = {
        cutoff: score_func(y_true, (proba >= cutoff).astype(int))
        for cutoff in threshold_set
    }

    return max(candidate_scores, key=candidate_scores.get)
+
+
def cross_validate_with_optimal_threshold(
        estimator: object,
        score_func: Callable,
        X_train: Union[pd.DataFrame, np.ndarray],
        y_train: Union[pd.Series, np.ndarray, None] = None,
        X_val: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val: Union[pd.Series, np.ndarray, None] = None,
        X_val_threshold: Union[pd.DataFrame, np.ndarray, None] = None,
        y_val_threshold: Union[pd.Series, np.ndarray, None] = None,
        cv: Union[Iterable, int, None] = None,
        cv_threshold: Union[Iterable, int, None] = None,
        additional_metrics: Union[Dict[str, Callable], None] = None,
        threshold_set: Union[Iterable, None] = None,
        scores: Dict = None)\
            -> Dict:
    """
    Cross-validate a binary classifier while tuning the probability
    cut-off used to turn predict_proba output into class labels.

    :param estimator: classifier implementing fit and predict_proba.
    :param score_func: metric score_func(y_true, y_pred) maximized when
        choosing the threshold and reported as train/test score.
    :param X_train, y_train: training data.
    :param X_val, y_val: validation data on which the test score is
        computed; required when cv is None.
    :param X_val_threshold, y_val_threshold: optional separate
        validation set used only for threshold tuning.
    :param cv: outer cross-validation: an int (number of stratified
        folds), an iterable of (train_inds, val_inds) pairs, or None.
    :param cv_threshold: inner cross-validation for threshold tuning;
        same accepted types as cv (one entry per outer fold when cv
        is used).
    :param additional_metrics: {"metric_name": metric} evaluated with
        the tuned threshold and appended under "train_"/"test_" keys.
    :param threshold_set: candidate thresholds forwarded to
        get_optimal_proba_threshold (its default grid when None).
    :param scores: accumulator dict used by the recursive calls;
        callers normally leave it None.
    :return: dict of per-fold lists: "test_threshold", "test_score",
        "train_score", plus entries for each additional metric.
    """
    logger = Log("cross_validate_with_optimal_threshold:")

    # defensive copies: neither this call nor its recursive calls may
    # mutate the caller's data
    X_train = deepcopy(X_train)
    y_train = deepcopy(y_train)
    X_val = deepcopy(X_val)
    y_val = deepcopy(y_val)
    X_val_threshold = deepcopy(X_val_threshold)
    y_val_threshold = deepcopy(y_val_threshold)

    scores = scores or {"test_threshold": [],
                        "test_score": [],
                        "train_score": []}

    additional_metrics = additional_metrics or {}

    for metric_name, metric in additional_metrics.items():
        if "test_" + metric_name not in scores:
            scores["test_" + metric_name] = []
            scores["train_" + metric_name] = []

    if cv is None:

        # test score is calculated on X_val

        assert((X_val is not None) and (y_val is not None)),\
            "Validation set must be set"

        if cv_threshold is None:

            refit = (X_val_threshold is not None)

            # if a validation set for proba threshold tuning is not given,
            # we use the validation set on which we calculate the test score
            # (this might lead to overfitting)

            X_val_threshold = X_val_threshold if refit else deepcopy(X_val)
            y_val_threshold = y_val_threshold if refit else deepcopy(y_val)

            cv_threshold, X_train, y_train = make_dummy_cv(
                    X_train=X_train,
                    y_train=y_train,
                    X_val=X_val_threshold,
                    y_val=y_val_threshold)
        else:

            # if cv_threshold is given, we find the optimal threshold
            # on each fold and output the average value for the threshold

            if (X_val_threshold is not None):
                logger.log_and_throw_warning((
                        "X_val_threshold is set "
                        "but cv_threshold will be used"))

            if isinstance(cv_threshold, int):
                cv_threshold = StratifiedKFold(n_splits=cv_threshold)\
                    .split(X=X_train, y=y_train)

            refit = True

        thresholds = []

        for train_inds, val_inds in cv_threshold:

            print("----- In cv threshold fold")

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                cv_slice_dataset(X=X_train,
                                 y=y_train,
                                 train_inds=train_inds,
                                 test_inds=val_inds)

            estimator.fit(X_train_fold, y_train_fold)

            proba_val = estimator.predict_proba(X_val_fold)[:, 1]

            # bug fix: forward the caller's candidate grid instead of
            # silently ignoring the threshold_set parameter
            threshold = get_optimal_proba_threshold(
                    score_func=score_func,
                    y_true=y_val_fold,
                    proba=proba_val,
                    threshold_set=threshold_set)

            thresholds.append(threshold)

            print("----- Threshold:", threshold)

        # bug fix: predict with the fold-averaged threshold (the value
        # recorded in "test_threshold", as described in the module
        # docstring), not with the threshold of the last fold only
        mean_threshold = np.mean(thresholds)

        scores["test_threshold"].append(mean_threshold)

        if refit:

            estimator.fit(X_train, y_train)

            proba_val = estimator.predict_proba(X_val)[:, 1]

        proba_train = estimator.predict_proba(X_train)[:, 1]

        pred_train = (proba_train >= mean_threshold)
        pred_val = (proba_val >= mean_threshold)

        train_score = score_func(y_train, pred_train)
        test_score = score_func(y_val, pred_val)

        for metric_name, metric in additional_metrics.items():
            scores["train_" + metric_name].append(metric(y_train, pred_train))
            scores["test_" + metric_name].append(metric(y_val, pred_val))

        scores["train_score"].append(train_score)
        scores["test_score"].append(test_score)

        return scores

    else:

        if isinstance(cv, int):
            cv = StratifiedKFold(n_splits=cv).split(X=X_train, y=y_train)

        cv_threshold = cv_threshold or []

        # one recursive call per outer fold; zip_longest pads missing
        # cv_threshold entries with None (dummy-cv path above)
        for (train_inds, val_inds), cv_fold in zip_longest(cv, cv_threshold):

            print("=== In cv fold")

            X_train_fold, X_val_fold, y_train_fold, y_val_fold =\
                cv_slice_dataset(X=X_train,
                                 y=y_train,
                                 train_inds=train_inds,
                                 test_inds=val_inds)

            scores = cross_validate_with_optimal_threshold(
                    estimator=estimator,
                    score_func=score_func,
                    X_train=X_train_fold,
                    y_train=y_train_fold,
                    X_val=X_val_fold,
                    y_val=y_val_fold,
                    cv_threshold=cv_fold,
                    additional_metrics=additional_metrics,
                    threshold_set=threshold_set,
                    scores=scores)

            print("=== scores:", scores)

        return scores
+
+
if __name__ == "__main__":

    # Manual smoke test of cross_validate_with_optimal_threshold on the
    # breast-cancer data set, exercising the supported cv / cv_threshold
    # configurations one after another.
    from sklearn.metrics import accuracy_score, precision_score
    from sklearn.datasets import load_breast_cancer
    from xgboost import XGBRFClassifier
    from sklearn.model_selection import train_test_split

    data_loader = load_breast_cancer()

    X = data_loader["data"]
    y = data_loader["target"]

    X_train, X_val, y_train, y_val = train_test_split(X, y)

    estimator = XGBRFClassifier()

    score_func = accuracy_score

    additional_metrics = {"precision": precision_score}

    averaged_scores = []
    averaged_thresholds = []

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold=None\n")

    # 1) no outer cv; the threshold is tuned on the same validation set
    #    used for the test score (overfitting risk noted in the function)
    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=None,
            y_val_threshold=None,
            cv=None,
            cv_threshold=None,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # NOTE(review): X_train/y_train are overwritten here, so all later
    # runs train on the reduced training set -- see the TODO list below.
    X_train, X_val_threshold, y_train, y_val_threshold =\
        train_test_split(X_train, y_train)

    print("\nTesting cv=None, cv_threshold=None, X_val_threshold\n")

    # 2) dedicated hold-out set for threshold tuning
    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=None,
            cv_threshold=None,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    print("\nTesting cv=None, cv_threshold=3 \n")

    # 3) threshold averaged over a 3-fold inner cv
    #    (X_val_threshold is ignored with a warning in this mode)
    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=None,
            cv_threshold=3,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=None \n")

    # 4) 3-fold outer cv, dummy inner cv per fold
    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=3,
            cv_threshold=None,
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    # NOTE(review): unlike the other runs, these results are not appended
    # to averaged_scores/averaged_thresholds -- presumably an oversight;
    # confirm against the TODO list below.
    print("\n ########################################################## \n")

    print("\nTesting cv=3, cv_threshold=[3, 3, 3] \n")

    # 5) 3-fold outer cv with a 3-fold inner cv in every outer fold
    scores = cross_validate_with_optimal_threshold(
            estimator=estimator,
            score_func=accuracy_score,
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            X_val_threshold=X_val_threshold,
            y_val_threshold=y_val_threshold,
            cv=3,
            cv_threshold=[3, 3, 3],
            additional_metrics=additional_metrics)

    print("\nScores:", scores)

    averaged_scores.append(np.mean(scores["test_score"]))
    averaged_thresholds.append(np.mean(scores["test_threshold"]))

    print("\n ########################################################## \n")

    # TODO: check overwriting X_train,
    # additional metrics append instead of overwrite
    # check the length of cv_threshold
    # test custom cv, cv_threshold

    print("\n Averaged test score:", averaged_scores)
    print("\n Averaged threshold:", averaged_thresholds)

+ 97 - 0
cdplib/ml_validation/expanding_cv.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Dec  9 09:55:52 2020
+
+@author: tanya
+"""
+
+from typing import Union, Iterable, Tuple, List
+import pandas as pd
+import numpy as np
+from itertools import accumulate, repeat, takewhile
+
+from cdplib.log import Log
+
+
def make_expanding_cv(test_proportion: float,
                      start_train_proportion: float,
                      step_proportion: float = None,
                      expanding_test_size: bool = False,
                      data_set_size: Union[float, None] = None,
                      index: Union[pd.Series, np.ndarray, list, None] = None)\
        -> Union[Iterable[Tuple[List]], None]:
    """
    Yield (train_index, test_index) pairs for an expanding-window
    cross-validation: the train window always starts at the beginning
    and grows by step_proportion per fold; the test window follows
    immediately after the train window.

    Exactly one of data_set_size / index must be given.

    :param test_proportion: test-window size as a fraction of the set.
    :param start_train_proportion: fraction used by the first train fold.
    :param step_proportion: per-fold growth of the train window.
    :param expanding_test_size: when True, the test window is sized as
        a fraction of the current train window instead of the full set.
    :param data_set_size: number of samples (used when index is None).
    :param index: explicit index to slice (used when size is None).
    """
    logger = Log("make_expanding_cv:")

    try:
        assert((index is None) != (data_set_size is None)),\
            "Set index or data_set_size"

        # bug fix: wrap any provided index in a Series so that the
        # list-of-positions indexing below works for every accepted
        # input type (plain lists crash on index[train_inds]); this
        # mirrors the sliding-window variant of this function
        index = pd.Series(index) if (index is not None)\
            else pd.Series(range(data_set_size))

        data_set_size = data_set_size or len(index)

        start_train_size = int(start_train_proportion * data_set_size)
        step_size = int(step_proportion * data_set_size)

        test_size = int(test_proportion * data_set_size)

        # train sizes: start_train_size, +step_size, ... while a full
        # test window still fits after the train window
        train_inds_set = (list(range(train_size))
                          for train_size in
                          takewhile(
                                  lambda x: x <= data_set_size - test_size,
                                  accumulate(repeat(start_train_size),
                                             lambda x, _: x + step_size)))

        for train_inds in train_inds_set:

            if expanding_test_size:

                yield (index[train_inds],
                       index[train_inds[-1] + 1:
                             train_inds[-1] + 1
                             + int(test_proportion*len(train_inds))])

            else:

                yield (index[train_inds],
                       index[train_inds[-1] + 1:
                             train_inds[-1] + 1 + test_size])

    except Exception as e:
        logger.log_and_raise_error(("Failed to make expanding cv. "
                                    "Exit with error: {}".format(e)))
+
+
if __name__ == "__main__":

    # Smoke test: materialize both variants of the expanding CV and rely
    # on make_expanding_cv's internal error logging to surface failures.
    logger = Log("Test_expanding_cv: ")

    logger.info("Start Testing")

    logger.info("Testing expanding cv: ")

    # size-based variant with a test window that grows with the train fold
    cv = make_expanding_cv(data_set_size=50,
                           test_proportion=0.1,
                           start_train_proportion=0.6,
                           step_proportion=0.1,
                           expanding_test_size=True)

    cv = list(cv)

    logger.info("Testing expanding cv with datetime index")

    # index-based variant: folds are slices of a datetime index
    cv = make_expanding_cv(
            test_proportion=0.1,
            start_train_proportion=0.6,
            step_proportion=0.1,
            index=pd.date_range(start=pd.to_datetime("2020-01-01"),
                                periods=50))

    cv = list(cv)

    logger.info("Finish testing")

+ 235 - 228
cdplib/pipeline_selector/PipelineSelector.py

@@ -24,8 +24,10 @@ import time
 import datetime
 import numpy as np
 import pandas as pd
+from copy import deepcopy
 from abc import ABC, abstractmethod, abstractproperty
-from typing import Callable
+from typing import Callable, Optional, TypedDict,\
+    Literal, Dict, Iterable, List, Tuple, Union
 import functools
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import cross_validate as sklearn_cross_validation
@@ -34,10 +36,17 @@ from hyperopt import STATUS_OK, STATUS_FAIL
 from cdplib.log import Log
 from cdplib.utils import ExceptionsHandler
 from cdplib.utils import LoadingUtils
+from cdplib.ml_validation import CVComposer
 
 sys.path.append(os.getcwd())
 
 
+class SpaceElementType(TypedDict):
+    name: str
+    pipeline: Pipeline
+    params: dict
+
+
 class PipelineSelector(ABC):
     """
     An abstract class for selecting a machine learning
@@ -53,16 +62,20 @@ class PipelineSelector(ABC):
     Children classes: hyperopt and custom gridsearch.
     """
     def __init__(self,
-                 cost_func: (Callable, str),
+                 cost_func: Union[Callable, str],
                  greater_is_better: bool,
                  trials_path: str,
-                 backup_trials_freq: int = None,
-                 cross_val_averaging_func: Callable = None,
-                 additional_metrics: dict = None,
-                 strategy_name: str = None,
-                 stdout_log_level: str = "INFO"):
+                 backup_trials_freq: Optional[int] = None,
+                 cross_val_averaging_func: Callable = np.mean,
+                 additional_metrics: Optional[Dict[str, Callable]] = None,
+                 additional_averaging_funcs:
+                     Optional[Dict[str, Callable]] = None,
+                 strategy_name: Optional[str] = None,
+                 stdout_log_level: Literal["INFO", "WARNING", "ERROR"]
+                 = "INFO"):
         """
         :param Callable cost_func: function to minimize or maximize
+            over the elements of a given (pipeline/hyperparameter) space
 
         :param bool greater_is_better: when True
             cost_func is maximized, else minimized.
@@ -72,62 +85,42 @@ class PipelineSelector(ABC):
             select information about the obtained scores, score variations,
             and pipelines, and parameters tried out so far. If a trials object
             already exists at the given path, it is loaded and the
-            search is continued, else, the search is started from
-            the beginning.
+            search is continued, else, the search is started from scratch.
 
         :param backup_trials_freq: frequecy in interations (trials)
             of saving the trials object at the trials_path.
             if None, the trials object is backed up avery time
             the score improves.
 
-        :param str log_path: Optional, when not provided logs to stdout.
+        :param Callable cross_val_averaging_func: Function to aggregate
+            the cross-validation scores of the cost_func.
+            Example different from the mean: mean - c*var.
+
+        :param additional_metics: dict of additional metrics to keep track of
+            in the trials of the form {"metric_name": metric}.
 
-        :param Callable cross_val_averaging_func: optional,
-            when not provided set to mean. Function
-            to aggregate the cross-validated values of the cost function.
-            Classic situation is to take the mean,
-            another example is, for example mean() - c*var().
+        :param additional_averaging_funcs: functions used to aggregate
+            the output of the cross_validate function.
+            The output always contains the scores of the cost_func,
+            additional_metrics (if it is not empty),
+            but it can also contain additional information
+            (like probability threshold for example)
+            if different from cross_val_averaging_func.
+            Of the form {"metric_name": averaging_func}
 
-        :param additional_metics: dict of additional metrics to save
-            of the form {"metric_name": metric} where metric is a Callable.
+            Remark:
 
-        :param str strategy_name: a name might be asigned to the trials,
-            a strategy is defined by the data set, cv object, cost function.
-            When the strategy changes, one should start with new trials.
+        :param str strategy_name:
+            a strategy is defined by the data set (columns/features and rows),
+            cv object, cost function.
+            When the strategy changes, one must start with new trials.
 
         :param str stdout_log_level: can be INFO, WARNING, ERROR
         """
-        try:
+        self._logger = Log("PipelineSelector: ",
+                           stdout_log_level=stdout_log_level)
 
-            self._logger = Log("PipelineSelector: ",
-                               stdout_log_level=stdout_log_level)
-
-            input_errors = [
-                    (cost_func, Callable,
-                     "Parameter 'cost_func' must be a Callable"),
-                    (greater_is_better, bool,
-                     "Parameter 'greater_is_better' must be bool type"),
-                    (trials_path, str,
-                     "Parameter 'trials_path' must be of string type"),
-                    (cross_val_averaging_func, (Callable, None.__class__),
-                     ("Parameter 'cross_val_averaging_func'"
-                      "must be a Callable")),
-                    (backup_trials_freq, (int, None.__class__),
-                     "Parameter backup_trials_freq must be an int"),
-                    (additional_metrics, (dict, None.__class__),
-                     "Parameter additional_metrics must be a dict"),
-                    (strategy_name, (str, None.__class__),
-                     "Parameter strategy_name must be a str"),
-                    (stdout_log_level, str,
-                     "Parameter stdout_log_level must be a str")]
-
-            for p, t, err in input_errors:
-                assert((isinstance(p, t))), err
-
-            assert((additional_metrics is None) or
-                   all([isinstance(metric, Callable)
-                        for metric in additional_metrics.values()])),\
-                "Metrics in additional_metrics must be Callables"
+        try:
 
             ExceptionsHandler(self._logger)\
                 .assert_is_directory(path=trials_path)
@@ -143,18 +136,14 @@ class PipelineSelector(ABC):
             self._score_factor = (not greater_is_better) - greater_is_better
             self.trials_path = trials_path
             self._backup_trials_freq = backup_trials_freq
-            self._cross_val_averaging_func = cross_val_averaging_func\
-                or np.mean
-            self._additional_metrics = additional_metrics or {}
             self._strategy_name = strategy_name
             self._data_path = None
             self._cv_path = None
 
-            # best_score can be also read from trials
-            # but is kept explicitely in order not to
-            # search through the trials object every time
-            # loss is the opposite of score
-            self.best_score = np.nan
+            self._X = None
+            self._y = None
+            self._cv = None
+            self._space = None
 
             # if cross-valition is not configured,
             # sklearn cross-validation method is taken by default
@@ -164,23 +153,17 @@ class PipelineSelector(ABC):
             # it is loaded and the search is continued. Else,
             # the search is started from the beginning.
             if os.path.isfile(self.trials_path):
-                try:
-                    with open(self.trials_path, "rb") as f:
-                        self._trials = pickle.load(f)
 
-                    self._start_iteration = self.number_of_trials
+                with open(self.trials_path, "rb") as f:
+                    self._trials = pickle.load(f)
 
-                    self.best_score = self.best_trial_score
+                self._start_iteration = self.number_of_trials
 
-                    self._logger.info(("Loaded an existing trials object"
-                                       "Consisting of {} trials")
-                                      .format(self._start_iteration))
+                self.best_score = self.best_trial_score
 
-                except Exception as e:
-                    err = ("Trials object could not be loaded. "
-                           "Exit with error {}").format(e)
-                    self._logger.log_and_raise_error(err)
-                    self._trials = None
+                self._logger.info(("Loaded an existing trials object"
+                                   "Consisting of {} trials")
+                                  .format(self._start_iteration))
 
             else:
                 self._logger.warning(("No existing trials object was found, "
@@ -188,6 +171,7 @@ class PipelineSelector(ABC):
 
                 self._trials = None
                 self._start_iteration = 0
+                self.best_score = np.nan
 
             # keeping track of the current search iteration
             self._iteration = self._start_iteration
@@ -203,10 +187,9 @@ class PipelineSelector(ABC):
 
             self._logger.log_and_raise_error(err)
 
-    def _backup_trials(self):
+    def _backup_trials(self) -> None:
         '''
-        Pickles (Saves) the trials object.
-        Used in a scheduler.
+        Pickles (Saves) the trials object in binary format.
         '''
         try:
             with open(self.trials_path, "wb") as f:
@@ -218,30 +201,21 @@ class PipelineSelector(ABC):
 
     def configure_cross_validation(self,
                                    cross_validation: Callable,
-                                   kwargs: dict = None):
+                                   kwargs: dict = None) -> None:
         """
         Method for attaching a custom cross-validation function
+
         :param cross_validation: a function that has the same
              signature as sklearn.model_selection.cross_validate
         """
         try:
-            assert(isinstance(cross_validation, Callable)),\
-                "Parameter cross_validation must be a function"
-
             kwargs = kwargs or {}
 
-            assert(isinstance(kwargs, dict)),\
-                "Paramter kwargs must be a dict"
-
             self._cross_validation = functools.partial(
                     self._cross_validation, **kwargs)
 
             self.configured_cross_validation = True
 
-            if hasattr(cross_validation, "__name__"):
-                self.best_result["cross_validation"] =\
-                    cross_validation.__name__
-
             self._logger.info("Configured cross validation")
 
         except Exception as e:
@@ -252,8 +226,12 @@ class PipelineSelector(ABC):
 
     def configure_cross_validation_from_module(self,
                                                module_path: str,
-                                               name: str):
+                                               name: str) -> None:
         """
+        Attaches a cross-validation funciton defined in
+        a different python model. This function must have
+        the same signature as sklearn.model_seclection.cross_validate
+
         :param str module_path: path to python module
             where the cross_validation function is defined.
 
@@ -261,18 +239,12 @@ class PipelineSelector(ABC):
             loaded froma python module.
         """
         try:
-            assert(isinstance(module_path, str) and
-                   isinstance(name, str)),\
-                   "Parameters module_path and name must be of str type"
-
             self._cross_validation = \
                 LoadingUtils().load_from_module(
                         module_path=module_path, name=name)
 
             self.configured_cross_validation = True
 
-            self.best_result["cross_validation"] = name
-
             self._logger.info("Configured cross validation")
 
         except Exception as e:
@@ -281,8 +253,11 @@ class PipelineSelector(ABC):
 
             self._logger.log_and_raise_error(err)
 
-    def attach_space(self, space):
+    def attach_space(self, space) -> None:
         """
+        Method for attaching the pipeline/hyperparameter space
+        over which the score_func is optimized.
+
         :param space: space where
             the search is performed. A space might be either
             a list of dictionaries or a hyperopt space object
@@ -291,17 +266,21 @@ class PipelineSelector(ABC):
         """
         try:
             self._space = space
-            self._logger.info("Attached parameter distribution space")
+
             self.attached_space = True
 
+            self._logger.info("Attached parameter distribution space")
+
         except Exception as e:
             err = ("Failed to attach space. "
                    "Exit with error: {}".format(e))
 
             self._logger.log_and_raise_error(err)
 
-    def attach_space_from_module(self, module_path: str, name: str):
+    def attach_space_from_module(self, module_path: str, name: str) -> None:
         """
+        Attaches a space defined in a different python module.
+
         :param str module_path: path to python module
             where the space is defined.
 
@@ -309,34 +288,34 @@ class PipelineSelector(ABC):
             a python module.
         """
         try:
-            assert(isinstance(module_path, str) and
-                   isinstance(name, str)),\
-                   "Parameters module_path and name must be of str type"
-
             self._space = LoadingUtils().load_from_module(
                     module_path=module_path, name=name)
 
-            self._logger.info("Attached parameter distribution space")
-
             self.attached_space = True
 
+            self._logger.info("Attached parameter distribution space")
+
         except Exception as e:
             err = ("Failed to attach space from module. "
                    "Exit with error {}".format(e))
 
             self._logger.loger_and_raise_error(err)
 
-    def attach_data(self, X_train: (pd.DataFrame, np.ndarray),
-                    y_train: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    X_val: (pd.DataFrame, np.ndarray) = None,
-                    y_val: (pd.DataFrame, pd.Series, np.ndarray) = None,
-                    cv: (list, int) = None):
+    def attach_data(self, X_train: Union[pd.DataFrame, np.ndarray],
+                    y_train: Optional[Union[pd.DataFrame, pd.Series,
+                                            np.ndarray]] = None,
+                    X_val: Optional[Union[pd.DataFrame, np.ndarray]]
+                    = None,
+                    y_val: Optional[Union[pd.DataFrame, pd.Series,
+                                          np.ndarray]] = None,
+                    cv: Optional[Iterable[Tuple[List[int], List[int]]]]
+                    = None) -> None:
         '''
         :param array X_train: data on which
             machine learning pipelines are trained
 
         :param array y_train: optional, vector with targets,
-            (not all algorithms require a targets)
+            (None in case of unsupervised learning)
 
         :param array X_val: optional, validation data.
             When not provided, cross-validated value
@@ -344,53 +323,49 @@ class PipelineSelector(ABC):
 
         :param array y_val: optional, validation targets
 
-        :param list cv: list of tuples containing
+        :param list cv: iterable of tuples containing
             train and validation indices or an integer representing
             the number of folds for a random split of data
             during cross-validation
             example: [([0,1,2], [3,4]), ([1,2,3], [4,5])]
         '''
         try:
-            NoneType = None.__class__
-
-            input_err = "Non-valid combination of train and val data types"
+            assert((cv is None) == (X_val is not None)),\
+                "Exactly one of cv and X_val must be provided"
 
             if cv is None:
-                assert(isinstance(X_train, (pd.DataFrame, np.ndarray)) and
-                       isinstance(X_val, (pd.DataFrame, np.ndarray)) and
-                       isinstance(y_train, (pd.Series, np.ndarray,
-                                            pd.DataFrame, NoneType)) and
-                       isinstance(y_val, (pd.Series, np.ndarray)) and
-                       (y_val is None) == (y_train is None)), input_err
-
-                # cost is evaluated with a cross validation function
-                # that accepts an array and a cv object with
-                # indices of the fold splits.
+
+                assert((y_val is None) == (y_train is None)),\
+                    "y_train and y_val must both be provided or both be None"
+
                 # Here we create a trivial cv object
                 # with one validation split.
+                # (one fold: train on the X_train rows, validate on X_val)
+
+
+
+
 
                 train_inds = list(range(len(X_train)))
                 val_inds = list(range(len(X_train),
                                       len(X_train) + len(X_val)))
 
                 self._cv = [(train_inds, val_inds)]
+
                 self._X = np.concatenate([X_train, X_val])
                 self._y = None if y_train is None\
                     else np.concatenate([y_train, y_val])
 
             else:
-                assert(isinstance(X_train, (pd.DataFrame, np.ndarray)) and
-                       isinstance(y_train, (pd.Series, np.ndarray,
-                                            pd.DataFrame, NoneType)) and
-                       (X_val is None) and (y_val is None)), input_err
 
                 self._cv = cv
                 self._X = X_train
                 self._y = y_train
 
-            self._logger.info("Attached data")
             self.attached_data = True
 
+            self._logger.info("Attached data")
+
         except Exception as e:
             err = ("Failed to attach data. "
                    "Exit with error: {}".format(e))
@@ -399,17 +374,23 @@ class PipelineSelector(ABC):
 
     def attach_data_from_hdf5(self,
                               data_hdf5_store_path: str,
-                              cv_pickle_path: str = None):
-        """
-        Method for attaching data from a hdf5 store.
-             The hdf5 store is a binary file,
-             after loading it, it is a dictionary with keys
-             X_train (y_train, X_val, y_val). The cv is loaded
-             from a pickle file. The reason to separate the data
-             store from the cv store, is the hdf5 is optimized to
-             store large dataframes (especially with simple types) and
-             a a small list of lists like a cv-object is better
-             to be stored as a pickle file.
+                              cv_pickle_path: Optional[str] = None) -> None:
+        """
+        Method for attaching data from a hdf5 store
+         and a cv object from a pickled file.
+
+         The hdf5 store is a binary file,
+         after loading it, it is a dictionary with keys
+         X_train (y_train, X_val, y_val).
+
+         The cv is loaded from a pickle file.
+
+         The reason to separate the data
+         store from the cv store, is the hdf5 is optimized to
+         store large dataframes (especially with simple types) and
+         a small list of lists like a cv-object is better
+         to be stored as a pickle file.
+
         :param str data_hdf5_store_path: path to the hdf5 store
             with train and validation data
         :param str cv_pickle_path: path to the pickle file with
@@ -423,19 +404,16 @@ class PipelineSelector(ABC):
 
             self._data_path = data_hdf5_store_path
 
-            data_input = {}
-
-            for key in ["/X_train", "/y_train", "/X_val", "/y_val"]:
-                if key not in store.keys():
-                    data_input[key.replace("/", "")] = None
-                else:
-                    data_input[key.replace("/", "")] = store[key]
+            data_input = {key: store[key] if key in store else None
+                          for key in ["X_train", "y_train", "X_val", "y_val"]}
 
             if cv_pickle_path is not None:
+
                 assert(os.path.isfile(cv_pickle_path)),\
                     "Parameter cv_pickle_path is not a file"
 
                 data_input["cv"] = pickle.load(open(cv_pickle_path, "rb"))
+
                 self._cv_path = cv_pickle_path
 
             else:
@@ -449,21 +427,60 @@ class PipelineSelector(ABC):
             err = "Failed to attach data. Exit with error: {}".format(e)
             self._logger.log_and_raise_error(err)
 
+    @property
+    def default_summary(self) -> dict:
+        """
+        Default summary of the strategy.
+        Each time the _objective function is called
+        the current score and the information
+        about the tested space element is added to the
+        summary and it is saved to the Trials.
+        If summary saving is configured it is also
+        saved to a file, or a database when the score improves.
+        """
+        summary = {}
+
+        if self._strategy_name is not None:
+            summary["strategy_name"] = self._strategy_name
+
+        if isinstance(self._cost_func, str):
+            summary["cost_func"] = self._cost_func
+
+        elif hasattr(self._cost_func, "__name__"):
+            summary["cost_func"] = self._cost_func.__name__
+
+        summary["trials_path"] = self.trials_path
+
+        if self._data_path is not None:
+            summary["data_path"] = self._data_path
+
+        if self._cv_path is not None:
+            summary["cv_path"] = self._cv_path
+
+        summary["start_tuning_time"] = self.start_tuning_time
+
+        summary["iteration"] = self._iteration
+
+        return summary
+
     def configer_summary_saving(self,
-                                save_method: Callable = None,
-                                kwargs: dict = None):
-        """
-        Attaching a method for saving information about
-             the trials/space/strategy and the result of
-             the current best pipeline. This method can
-             save the result in a txt or a json file,
-             or in a database for example. Arguments like
-             file path or the table name can be specified in kwargs.
+                                save_method: Callable
+                                = functools.partial(
+                                        pd.DataFrame.to_excel,
+                                        **{"excel_writer": "result.xlsx"}),
+                                kwargs: Optional[dict] = None) -> None:
+        """
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
+
         :param Callable save_method: method for saving the result
             of the pipeline selection. The method must accept
-            a pandas DataFrame as argument. See self._save_result
-            method for the format of the argument being saved.
-            By default, saving to a csv file.
+            a pandas DataFrame as argument.
+            By default, saving to an excel file.
+
             Examples:
                 functools.partial(pd.DataFrame.to_csv,
                                   **{"path_or_buf": <PATH>})
@@ -476,13 +493,11 @@ class PipelineSelector(ABC):
                                   **{"collection_name": <NAME>})
 
             using functools can be avoided by providing the kwarg argument
+
         :param dict kwargs: a dictionary with keyword arguments
             (like tablename) to provide to the save_method
         """
         try:
-            save_method = save_method or functools.partial(
-                    pd.DataFrame.to_excel, **{"path_or_buf": "result.csv"})
-
             kwargs = kwargs or {}
 
             self._save_method = functools.partial(save_method, **kwargs)
@@ -494,10 +509,16 @@ class PipelineSelector(ABC):
         except Exception as e:
             err = ("Failed to configure the summary saving. "
                    "Exit with error {}".format(e))
+
             self._logger.log_and_raise_error(err)
 
-    def _save_summary(self, summary: dict):
+    def _save_summary(self, summary: dict) -> None:
         """
+        When the score calculated by _objective function improves,
+        the default summary is updated with information about the
+        current score and pipeline/hyperparameters
+        and can be saved to a file or database, depending
+        on the configured save_method.
         """
         try:
             assert(self.configured_summary_saving),\
@@ -511,29 +532,40 @@ class PipelineSelector(ABC):
 
             self._logger.log_and_raise_error(err)
 
-    def _evaluate(self, pipeline: Pipeline,
-                  scoring: Callable = None,
-                  cross_validation: Callable = None) -> dict:
+    def _evaluate(self, pipeline: Pipeline) -> Union[Dict[str, float], None]:
         """
-        This method is called in _objective.
+        Calculates the averaged cross-validated score and score variance,
+        as well as the averaged values and variances of the additional metrics.
+
+        This method is called in the _objective function that is
+        passed to the hyperopt optimizer.
 
-        Calculates the cost on the attached data.
         This function can be overriden, when the cost
         needs to be calculated differently,
         for example with a tensorflow model.
 
         :param Pipeline pipeline: machine learning pipeline
             that will be evaluated with cross-validation
-        :param cross_validation: a function that has the same
-             signature as sklearn.model_selection.cross_validate
 
         :return: dictionary with the aggregated
-            cross-validation score and
-            the score variance.
+            cross-validation scores and
+            the score variances for the scores in the output
+            of the cross-validation function.
+
+            form of the output:
+                {"score": 10, #score used in optimization,
+                 "score_variance": 0.5,
+                 "additional_metric1": 5,
+                 "additional_metric1_variance": 7}
+
+            a custom cross-validation function can also include for
+            example probability threshold for each fold, then
+            the output of this function will include the average
+            value and the variance of the probability threshold
+            over the folds.
         """
         try:
-
-            scoring = {"score": make_scorer(self._cost_func)}
+            scoring = {"score": make_scorer(self.cost_func)}
 
             scoring.update({metric_name: make_scorer(metric)
                             for metric_name, metric
@@ -543,13 +575,19 @@ class PipelineSelector(ABC):
                     estimator=pipeline,
                     X=self._X,
                     y=self._y,
-                    cv=self._cv or 5,
-                    scoring=scoring,
+                    cv=self._cv,
+                    scoring=scoring,
                     error_score=np.nan)
 
+            averaging_funcs = {
+                    metric_name: self._additional_averaging_funcs[metric_name]
+                    if metric_name in self._additional_averaging_funcs
+                    else self._cross_val_averaging_func
+                    for metric_name in scores}
+
             scores_average = {
                     metric_name.replace("test_", ""):
-                    self._cross_val_averaging_func(scores[metric_name])
+                    averaging_funcs[metric_name](scores[metric_name])
                     for metric_name in scores
                     if metric_name.startswith("test")}
 
@@ -563,12 +601,13 @@ class PipelineSelector(ABC):
 
         except Exception as e:
             err = "Failed to evaluate pipeline. Exit with error: {}".format(e)
+
             self._logger.log_and_raise_error(err)
 
-    def _objective(self, space_element: dict) -> dict:
+    def _objective(self, space_element: SpaceElementType) -> dict:
         '''
-        This method is called in search_for_best_pipeline
-        inside the hyperopt fmin method.
+        This method is called in run_trials method
+        that is using the hyperopt fmin optimizer.
 
         Uses _evaluate method.
 
@@ -581,12 +620,10 @@ class PipelineSelector(ABC):
 
         :Warning: fmin minimizes the loss,
         when _evaluate returns a value to be maximized,
-        it should be multiplied by -1 to obtain loss.
+        it is multiplied by -1 to obtain loss.
 
-        :param dict space_element: must contain keys
-            name (with the name of the pipeline),
-            pipeline (Pipeline object),
-            params (dict of pipeline params)
+        :param SpaceElementType space_element: element
+            of the space over which the optimization is done
 
         :output: dictionary with keys
             loss (minimized value),
@@ -596,18 +633,9 @@ class PipelineSelector(ABC):
             score_variance,
             timestamp (end of execution),
             train_time: execution time
+            and other keys given in self.default_summary
         '''
         try:
-            assert(isinstance(space_element, dict) and
-                   set(['name', 'pipeline', 'params'])
-                   <= space_element.keys()),\
-                 "Space elements are of wrong form"
-
-            assert(isinstance(space_element['name'], str) and
-                   isinstance(space_element['pipeline'], Pipeline) and
-                   isinstance(space_element['params'], dict)),\
-                "Space elements are of wrong form"
-
             start_time = time.time()
 
             assert(self.attached_data),\
@@ -615,32 +643,14 @@ class PipelineSelector(ABC):
                  "in order to effectuate the best"
                  "pipeline search")
 
-            summary = {}
-
-            if self._strategy_name is not None:
-                summary["strategy_name"] = self._strategy_name
+            summary = deepcopy(self.default_summary)
 
-            if isinstance(self._cost_func, str):
-                summary["cost_func"] = self._cost_func
-
-            elif hasattr(self._cost_func, "__name__"):
-                summary["cost_func"] = self._cost_func.__name__
-
-            summary["trials_path"] = self.trials_path
-
-            if self._data_path is not None:
-                summary["data_path"] = self._data_path
-
-            if self._cv_path is not None:
-                summary["cv_path"] = self._cv_path
-
-            summary["start_tuning_time"] = self.start_tuning_time
-
-            summary["iteration"] = self._iteration
-
-            backup_cond = (self._backup_trials_freq is not None) and\
-                ((self._iteration - self._start_iteration - 1) %
-                 self._backup_trials_freq == 0) or\
+            # backup the current trials if the score improved
+            # at previous iteration or every ith iteration
+            # if the backup_trials_freq is set
+            backup_cond = ((self._backup_trials_freq is not None) and
+                           ((self._iteration - self._start_iteration - 1) %
+                            self._backup_trials_freq == 0)) or\
                 self._score_improved
 
             if backup_cond:
@@ -666,9 +676,6 @@ class PipelineSelector(ABC):
 
             end_time = time.time()
 
-            assert(not np.isnan(result["score"])),\
-                "Score value is not in the output of the _evaluate method"
-
             summary['status'] = STATUS_OK
             summary.update(result)
             summary['loss'] = self._score_factor * summary['score']
@@ -695,6 +702,7 @@ class PipelineSelector(ABC):
 
             self._logger.warning("Trial failed with error {}".format(e))
 
+            summary = {}
             summary['status'] = STATUS_FAIL
             summary['timestamp'] = datetime.datetime.today()
             summary['error'] = e
@@ -725,11 +733,10 @@ class PipelineSelector(ABC):
     def best_trial(self) -> dict:
         """
         Best trial sor far.
-         Should contain the best pipeline,
-         best hyperparameters,
-         as well as an output of the self._objective method,
-         but the exact form of the output depends on the implementation
-         of the Trials object.
+         Should contain the status, pipeline,
+         hyperparameters, and the score (loss).
+         Other information is optional and is defined
+         by self.default_summary
         """
         pass
 
@@ -743,6 +750,7 @@ class PipelineSelector(ABC):
     @abstractproperty
     def best_trial_score_variance(self) -> float:
         """
+        Variance of the cross-validation score of the best pipeline
         """
         pass
 
@@ -771,12 +779,11 @@ class PipelineSelector(ABC):
         pass
 
     @abstractmethod
-    def trials_to_excel(self, path: str):
+    def trials_to_excel(self, path: str) -> None:
         """
         Trials object in the shape of table written to excel,
         should contain the iteration, pipeline (as str),
         hyperparamters (as str), self.best_result (see self._objective method)
-        as well as additional information configured
-        through self.save_result method.
+        as well as additional information defined by self.default_summary
         """
         pass

+ 21 - 10
cdplib/utils/CleaningUtils.py

@@ -8,13 +8,16 @@ Created on Fri Sep 27 16:20:03 2019
 
 import pandas as pd
 import numpy as np
+from typing import Union, Any, List
 
 
 class CleaningUtils:
     '''
     Unites different methods for data cleaning
     '''
-    def convert_dates(series: pd.Series, formats: (str, list)) -> pd.Series:
+    def convert_dates(self,
+                      series: pd.Series[Any],
+                      formats: Union[str, List[str]]) -> pd.Series:
         '''
         Converts values from string to date in a pandas Series
          where possibly multiple date formats are mixed
@@ -29,8 +32,7 @@ class CleaningUtils:
 
                 series = series.astype(str)
 
-                series.loc[missing_leading_zero] = "0" +\
-                    series.loc[missing_leading_zero]
+                series.loc[missing_leading_zero] = "0" + series.loc[missing_leading_zero]
 
             converted_this_format = pd.to_datetime(series,
                                                    format=formt,
@@ -71,21 +73,28 @@ class CleaningUtils:
 
         return s
 
-    def melt_duplicated_columns(self, df: pd.DataFrame, suffix: str = "", prefix: str = "") -> pd.DataFrame:
+    def melt_duplicated_columns(self, df: pd.DataFrame,
+                                suffix: str = "",
+                                prefix: str = "") -> pd.DataFrame:
         '''
         If a dataframe has multiple columns with the same name
          (up to a prefix or a suffix),
          melts the columns together in one
 
-        :parame suffix: string or regex up to which we consider names as duplicated
-        :parame prefix: string or regex up to which we consider names as duplicated
+        :param suffix: string or regex up
+            to which we consider names as duplicated
+        :param prefix: string or regex
+            up to which we consider names as duplicated
         '''
         from collections import Counter
 
         import re
 
-        # remove the suffix and the prefix from the column names (now the duplicates are truely duplicates)
-        df.columns = [re.sub(re.compile(prefix), "", re.sub(re.compile(suffix), "", c)) for c in df.columns]
+        # remove the suffix and the prefix from the column names
+        # (now the duplicates are truely duplicates)
+        df.columns = [re.sub(re.compile(prefix), "",
+                             re.sub(re.compile(suffix), "", c))
+                      for c in df.columns]
 
         column_counter = Counter(df.columns)
 
@@ -100,10 +109,12 @@ class CleaningUtils:
             df_melted = []
 
             for dup_var in dup_vars:
-                dup_var_melted = pd.melt(frame=df, id_vars=id_vars, value_vars=[dup_var], value_name=dup_var)\
+                dup_var_melted = pd.melt(frame=df,
+                                         id_vars=id_vars,
+                                         value_vars=[dup_var],
+                                         value_name=dup_var)\
                                    .set_index(id_vars)[dup_var]
 
                 df_melted.append(dup_var_melted)
 
             return pd.concat(df_melted, axis=1, sort=False).reset_index()
-

+ 36 - 0
cdplib/utils/TypeConverter.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Apr 24 09:06:13 2020
+
+@author: tanya
+"""
+
+import numpy as np
+import pandas as pd
+
+class TypeConverter:
+    """
+    Library for methods to manage python types
+    """
+    def __init__(self):
+        """
+        """
+        from cdplib.log import Log
+
+        self._logger = Log("TypeConverter")
+
+    def convert_to_ndarray(self, x: (pd.DataFrame, np.ndarray)) -> np.ndarray:
+        '''
+        Converts a pandas DataFrame or Series to a numpy array.
+        '''
+        if isinstance(x, np.ndarray):
+            return x
+
+        elif (isinstance(x, pd.core.frame.DataFrame))\
+                or (isinstance(x, pd.core.series.Series)):
+            return x.values
+
+        else:
+            self._logger.log_and_raise_error_stack_info(
+                    'The argument must be a numpy array or a pandas DataFrame')