
resolve conflict

tsteuer committed 4 years ago
commit 094f32f2fb

+ 1 - 19
Pipfile

@@ -7,33 +7,15 @@ verify_ssl = true
 
 [packages]
 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
-pycodestyle = "*"
-ipykernel = "*"
-spyder-kernels = "==0.*"
-cloudpickle = "*"
-openpyxl = "*"
-setuptools = "*"
-scipy = "*"
-matplotlib = "*"
-tsfresh = "*"
-hyperopt = "*"
-xgboost = "*"
-scikit-learn = "*"
 pandas = "!=0.24.0"
-pandas-compat = "*"
-xmltodict = "*"
 sqlalchemy = "*"
 sqlparse = "*"
 pymysql = "*"
-xlrd = "*"
 pymongo = "*"
 jsonref = "*"
-faker = "*"
-xeger = "*"
 simplejson = "*"
 mysql = "*"
-sqlalchemy-utils = "*"
-apyori==1.1.1
+hyperopt = "*"
 
 [requires]
 python_version = "3"

File diff suppressed because it is too large
+ 205 - 684
Pipfile.lock


+ 86 - 25
cdplib/db_handlers/MongodbHandler.py

@@ -13,11 +13,13 @@ Created on Mon Sep 16 13:27:44 2019
 import simplejson
 import sys
 import os
+import time
 
 import pymongo
 from pymongo import MongoClient
 import pandas as pd
 import numpy as np
+from pprint import pprint
 
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -171,7 +173,7 @@ class MongodbHandler:
         command = {
                     'collMod': collection_name,
                     'validator': {
-                        '$jsonSchema': parse_obj.schemas[0]
+                        '$jsonSchema': parse_obj.load_and_parse_schema_for_mongodb(schema_path)
                     },
                     'validationLevel': validation_level,
                     'validationAction': validation_action
@@ -239,7 +241,22 @@ class MongodbHandler:
                 self._database[collection_name].insert_many(data, ordered=ordered)
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
+            if len(data) > 1:
+
+                self._log.warning(('An error occured inserting {} documents into database: {} and collection: {}.').format(len(data), self._database_name, collection_name))
+                self._log.warning('This might be because one or more documents are invalid.')
+                self._log.warning('We will try to insert the documents one-by-one and report which are invalid.')
+                self._log.warning(('Error: {}').format(error))
+
+                for row in data:
+
+                    try:
+                        self._database[collection_name].insert_one(row)
+                    except Exception as error:
+                        pprint(row)
+                        self._log.warning(error)
+            else:
+                self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
 
         self._log.info(('Data has been inserted into the {} collection').format(collection_name))
 
@@ -267,16 +284,22 @@ class MongodbHandler:
 
         try:
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},return_values)
+                query = {}
+                data = self._database[collection_name].find(query,return_values)
+
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, return_values)
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, return_values)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: {}:{}. \nError:{}').format(collection_name, attribute, comparison_operator, attribute_value, error))
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+            return None
+
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def aggregate_data_and_generate_dataframe(self, collection_name: str, aggregation_pipeline: list, index: str = None):
 
@@ -284,11 +307,16 @@ class MongodbHandler:
             data = self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True)
         except Exception as error:
             self._log.log_and_raise_error(('A problem occured when aggregating the collection {} with the pipeline {}. \nError: {}').format(collection_name, aggregation_pipeline, error))
+            return None
 
         return self.convert_mongo_data_into_dataframe(data, index, collection_name)
 
-    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None) -> pd.DataFrame():
+    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None, chunksize: int = 500) -> pd.DataFrame():
 
+        start_time = time.time()
+        '''
+        self._log.info('Converting returned mongo data into a DataFrame')
+
         data = list(data)
         try:
             if len(data)> 0:
@@ -299,11 +327,38 @@ class MongodbHandler:
                 df = pd.DataFrame(data)
                 if index is not None:
                     df.set_index(index, inplace=True)
+
+                self._log.info(('DataFrame conversion is done, took {} seconds').format(time.time()-start_time))
                 return df
             else:
                 self._log.warning(('No data for the query was found').format())
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to convert mongo data into pd.Dataframe. \nError: {} ').format(error))
+        '''
+
+        frames = []
+        records = []
+        for iteration, value in enumerate(data):
+
+            records.append(value)
+            if (iteration + 1) % chunksize == 0:
+                frames.append(pd.DataFrame(records))
+                records = []
+
+        if records:
+            frames.append(pd.DataFrame(records))
+
+        return_df = pd.concat(frames, axis=0, sort=False)
+
+        if index is not None:
+            return_df.set_index(index, inplace=True)
+
+        self._log.info(('{} Rows were fetched from {}. DataFrame conversion is done, took {} seconds').format(len(return_df.index), collection_name if collection_name is not None else 'the database', time.time()-start_time))
+
+        return return_df
+
+
+
 
     #def update_data_in_collection(self, query_label: str, query_value: str, update_label:str, update_value: str, collection_name:str):
     #    self._database[collection_name].update_one({query_label:query_value}, {"$set": {update_label: update_value}})
@@ -338,7 +393,7 @@ class MongodbHandler:
         '''
         return self._database[collection_name].find({query_label:query_value}).count() > 0
 
-    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_as_dataframe: bool = True):
+    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_id: bool = False, return_as_dataframe: bool = True):
         '''
             Queries data between two dates.
 
@@ -349,16 +404,20 @@ class MongodbHandler:
             :param str index:
             :param bool return_as_dataframe:
         '''
+        assert(isinstance(collection_name, str)),\
+            "Parameter 'collection_name' must be a string type"
         try:
-            data = self._database[collection_name].find({date_label: {'$gt': from_date_value, '$lt': to_date_value}})
+            query = {date_label: {'$gt': from_date_value, '$lt': to_date_value}}
+            data = self._database[collection_name].find(query, {'_id': return_id})
 
         except Exception as error:
-            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: $gt:{}, $lt:{}. \nError:{}').format(collection_name, date_label, from_date_value, to_date_value, error))
+            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}. \nError:{}').format(collection_name, query, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
-        else:
-            return data
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
 
     def query_oldest_or_newest_date_in_collection(self, collection_name: str, date_label: str, oldest: bool = False):
 
@@ -385,21 +444,23 @@ class MongodbHandler:
         try:
 
             if attribute == None or attribute_value == None:
-                data = self._database[collection_name].find({},{'_id': return_id}).sort(sort_label, direction).limit(limit)
+                query = {}
+                data = self._database[collection_name].find(query,{'_id': return_id}).sort(sort_label, direction).limit(limit)
             else:
-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, {'_id': return_id}).sort(sort_label, direction).limit(limit)
-
-            if len(list(data)) == 0:
-                self._log.warning('No data was found for the query')
-                return None
+                query = {attribute: {comparison_operator: attribute_value}}
+                data = self._database[collection_name].find(query, {'_id': return_id}).sort(sort_label, direction).limit(limit)
 
         except Exception as error:
             self._log.log_and_raise_error(('An error occured trying to query data from {}, \nError:{}').format(collection_name, error))
 
-        if return_as_dataframe:
-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+        if data.collection.count_documents(query) != 0:
+            if return_as_dataframe:
+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
+            else:
+                return data
         else:
-            return data
+            self._log.warning('No data was found for the query')
+            return None
 
     def update_data_in_collection(self, update_label:str, update_value: str, collection_name:str, query_label: str = None, query_value: str = None, create_if_not_exist: bool = True, find_query: dict = None, update_many: bool = False):
 

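For orientation (not part of the commit), a minimal sketch of how the reworked query helpers might be called from application code; the constructor arguments, collection name and field names below are placeholders:

from cdplib.db_handlers.MongodbHandler import MongodbHandler

# Hypothetical connection settings; the real constructor arguments depend on the deployment.
handler = MongodbHandler(database_name='test_database')

# The DataFrame is now assembled in chunks (chunksize=500 by default) and the helper
# returns None instead of an empty result when nothing matches the query.
df = handler.query_data_and_generate_dataframe('process_data',
                                               attribute='status',
                                               attribute_value='ok',
                                               comparison_operator='$eq')

# return_id controls whether the MongoDB _id field is kept in the projection.
window = handler.query_data_between_dates_and_generate_dataframe(
    'process_data', 'timestamp',
    '2020-01-01T00:00:00.000Z', '2020-02-01T00:00:00.000Z',
    index='machine_id', return_id=False)
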
+ 9 - 6
cdplib/db_handlers/SQLHandler.py

@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 """
 Created on Tue Sep 18 16:20:50 2018
 
@@ -211,13 +209,17 @@ class SQLHandler:
         transaction = connection.begin()
 
         errors = []
+        results = []
 
         # in the case of multi-query execute each query
         for sub_query in sqlparse.split(query):
             if len(sub_query) > 0:
                 try:
-                    connection.execute(sub_query, multi=True)
-
+                    result = connection.execute(sub_query)
+                    if result.returns_rows:
+                        data = pd.DataFrame(result.fetchall())
+                        data.columns = result.keys()
+                        results.append(data)
                 except Exception as e:
                     errors.append(str(e))
 
@@ -231,6 +233,7 @@ class SQLHandler:
 
         transaction.commit()
         connection.close()
+        return results
 
     def execute_query_from_file(self, filename: str):
         '''
@@ -441,7 +444,7 @@ class SQLHandler:
                                tablename)
 
             data = self.execute(query)
-            colnames = data.columns.tolist()
+            colnames = data[0].columns.tolist()
 
         return colnames
 
@@ -640,4 +643,4 @@ class SQLHandler:
             self._engine.dispose()
         except Exception as e:
             print(('An error occured when trying to dispose the SQL engine. Error: {}').format(e))
-            raise Exception(e)
+            raise Exception(e)

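A small usage sketch (not from the commit) of the changed execute() return value; the connection string and table name are invented:

from cdplib.db_handlers.SQLHandler import SQLHandler

# Hypothetical constructor arguments; the real signature is defined in SQLHandler.__init__.
handler = SQLHandler(db_uri='mysql+pymysql://user:password@localhost:3306/test_db')

# execute() now returns a list with one DataFrame per statement that produced rows,
# so a multi-statement script yields one frame per SELECT.
frames = handler.execute("SELECT * FROM some_table; SELECT COUNT(*) AS n FROM some_table;")

for frame in frames:
    print(frame.head())
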
+ 96 - 88
cdplib/db_migration/DataFrameToCollection.py

@@ -51,7 +51,8 @@ class DataFrameToCollection():
     def to_list_of_documents(self, data: pd.DataFrame,
                              grp_fields: list,
                              schema: dict = None,
-                             _final_step: bool = True) -> list:
+                             _final_step: bool = True,
+                             already_reshaped: list = []) -> list:
         '''
         Reshapes a pandas dataframe to a list of documents according
          to a complex (json) mongodb schema
@@ -84,128 +85,135 @@ class DataFrameToCollection():
             if field not in self._unroll_nested_names(data.columns):
                 continue
 
-            field_type = schema["properties"][field]["bsonType"]
+            if field in already_reshaped:
+                reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
+                reshaped_fields.append(reshaped_field)
+            else:
+                field_type = schema["properties"][field]["bsonType"]
 
-            # if field has a simple type
-            if field_type not in ["array", "object"]:
+                # if field has a simple type
+                if field_type not in ["array", "object"]:
 
-                grp_fields = [c for c in grp_fields if c in data.columns]
+                    grp_fields = [c for c in grp_fields if c in data.columns]
 
-                # check that there is only one possible value of this field
-                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
+                    # check that there is only one possible value of this field
+                    n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
 
-                # n_distinct_valus can be 0 if the column only contains NaN values
-                if n_distinct_values > 1:
-                    err = "Field {0} is not unique with respect to {1}"\
-                          .format(field, grp_fields)
+                    # n_distinct_valus can be 0 if the column only contains NaN values
+                    if n_distinct_values > 1:
+                        err = "Field {0} is not unique with respect to {1}"\
+                            .format(field, grp_fields)
 
-                    self._log.error(err)
-                    raise Exception(err)
+                        self._log.error(err)
+                        raise Exception(err)
 
-                if field not in grp_fields:
-                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
-                else:
-                    reshaped_field =\
-                        data[grp_fields].drop_duplicates()\
-                        .set_index(grp_fields, drop=False)[field]
+                    if field not in grp_fields:
+                        reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
+                    else:
+                        reshaped_field =\
+                            data[grp_fields].drop_duplicates()\
+                            .set_index(grp_fields, drop=False)[field]
 
-                reshaped_fields.append(reshaped_field)
+                    reshaped_fields.append(reshaped_field)
 
-            # if field is sub-document (dictionary)
-            elif field_type == "object":
+                # if field is sub-document (dictionary)
+                elif field_type == "object":
 
-                sub_schema = deepcopy(schema["properties"][field])
+                    sub_schema = deepcopy(schema["properties"][field])
 
-                # rename sub-schema properties to match with data column names
-                sub_schema["properties"] =\
-                    {".".join([field, k]): v for k, v
-                     in sub_schema["properties"].items()}
+                    # rename sub-schema properties to match with data column names
+                    sub_schema["properties"] =\
+                        {".".join([field, k]): v for k, v
+                        in sub_schema["properties"].items()}
 
-                sub_data = self.to_list_of_documents(
-                            data=data,
-                            schema=sub_schema,
-                            grp_fields=grp_fields,
-                            _final_step=False)
+                    sub_data = self.to_list_of_documents(
+                                data=data,
+                                schema=sub_schema,
+                                grp_fields=grp_fields,
+                                _final_step=False,
+                                already_reshaped=already_reshaped)
 
-                # Need to be checked since child elements can be empty
-                if sub_data is not None:
+                    # Need to be checked since child elements can be empty
+                    if sub_data is not None:
 
-                    reshaped_field = sub_data.apply(self._make_dict, axis=1)
-                    reshaped_field.name = field
+                        reshaped_field = sub_data.apply(self._make_dict, axis=1)
+                        reshaped_field.name = field
 
-                    reshaped_fields.append(reshaped_field)
+                        reshaped_fields.append(reshaped_field)
 
-            # if field is a list of dictionaries
-            elif field_type == "array":
+                # if field is a list of dictionaries
+                elif field_type == "array":
 
 
-                items_type = schema["properties"][field]["items"]["bsonType"]
+                    items_type = schema["properties"][field]["items"]["bsonType"]
 
-                if items_type == "object":
-                    array_object = time.time()
-                    sub_schema = deepcopy(schema["properties"][field]["items"])
+                    if items_type == "object":
+                        array_object = time.time()
+                        sub_schema = deepcopy(schema["properties"][field]["items"])
 
-                    # rename sub-schema properties to match data column names
-                    sub_schema["properties"] =\
-                        {".".join([field, k]): v for k, v in
-                         sub_schema["properties"].items()}
+                        # rename sub-schema properties to match data column names
+                        sub_schema["properties"] =\
+                            {".".join([field, k]): v for k, v in
+                            sub_schema["properties"].items()}
 
-                    # extend grp fields by sub-fields of field simple types
-                    sub_grp_fields = [f for f in sub_schema["properties"]
-                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
-                                      and (f in data.columns)]
+                        # extend grp fields by sub-fields of field simple types
+                        sub_grp_fields = [f for f in sub_schema["properties"]
+                                        if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
+                                        and (f in data.columns)]
 
-                    if len(sub_grp_fields) == 0:
-                        err = ("One of the sub-keys in a list of documents"
-                               " must be of simple type for the field {}"
-                               .format(field))
+                        if len(sub_grp_fields) == 0:
+                            err = ("One of the sub-keys in a list of documents"
+                                " must be of simple type for the field {}"
+                                .format(field))
 
-                        self._log.error(err)
-                        raise Exception(err)
+                            self._log.error(err)
+                            raise Exception(err)
 
-                    # group and reshape sub-fields with complex types
-                    sub_data = self.to_list_of_documents(
-                                data=data,
-                                schema=sub_schema,
-                                grp_fields=grp_fields + sub_grp_fields,
-                                _final_step=False)
+                        # group and reshape sub-fields with complex types
+                        sub_data = self.to_list_of_documents(
+                                    data=data,
+                                    schema=sub_schema,
+                                    grp_fields=grp_fields + sub_grp_fields,
+                                    _final_step=False,
+                                    already_reshaped=already_reshaped)
 
-                    if sub_data is not None:
+                        if sub_data is not None:
 
-                        # gether the results into a list of dictionaries
-                        sub_data = sub_data.apply(self._make_dict, axis=1)
+                            # gether the results into a list of dictionaries
+                            sub_data = sub_data.apply(self._make_dict, axis=1)
 
-                        sub_data.name = field
-                        sub_data = sub_data.reset_index(grp_fields)
-                        ######################################################
-                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
-                        reshaped_field =\
-                            sub_data.groupby(grp_fields, sort=False)[field]\
-                                    .apply(self._make_list_of_distinct)
-                        ######################################################
-                        reshaped_fields.append(reshaped_field)
+                            sub_data.name = field
+                            sub_data = sub_data.reset_index(grp_fields)
+                            ######################################################
+                            ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
+                            reshaped_field =\
+                                sub_data.groupby(grp_fields, sort=False)[field]\
+                                        .apply(self._make_list_of_distinct)
+                            ######################################################
+                            reshaped_fields.append(reshaped_field)
 
 
-                # if field is a list of values with simple type
-                elif items_type == "array":
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                    # if field is a list of values with simple type
+                    elif items_type == "array":
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
-                    if field in data.columns:
+                        if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_list_of_distinct)
 
-                        reshaped_fields.append(reshaped_field)
-                else:
+                            reshaped_fields.append(reshaped_field)
+                    else:
 
-                    grp_fields = [c for c in grp_fields if c in data.columns]
+                        grp_fields = [c for c in grp_fields if c in data.columns]
 
-                    if field in data.columns:
+                        if field in data.columns:
 
-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
-                                             .apply(self._make_flattened_list_of_distinct)
+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
+                                                .apply(self._make_flattened_list_of_distinct)
 
-                        reshaped_fields.append(reshaped_field)
+                            reshaped_fields.append(reshaped_field)
 
         if len(reshaped_fields) > 0:
 

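A rough sketch (not part of the commit) of the new already_reshaped argument, which skips the schema-driven regrouping for columns that already hold reshaped values; the data, schema and column names are invented, and the constructor is assumed to take no arguments:

import pandas as pd
from cdplib.db_migration.DataFrameToCollection import DataFrameToCollection

data = pd.DataFrame({'machine_id': [1, 1, 2],
                     'measurements': [[1.0, 2.0], [1.0, 2.0], [3.5]]})

schema = {'bsonType': 'object',
          'properties': {'machine_id': {'bsonType': 'int'},
                         'measurements': {'bsonType': 'array',
                                          'items': {'bsonType': 'double'}}}}

# Because 'measurements' is listed in already_reshaped, it is only flattened and
# deduplicated per group instead of being parsed against the schema again.
docs = DataFrameToCollection().to_list_of_documents(data=data,
                                                    grp_fields=['machine_id'],
                                                    schema=schema,
                                                    already_reshaped=['measurements'])
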
+ 2 - 2
cdplib/db_migration/MigrationCleaning.py

@@ -358,11 +358,11 @@ class MigrationCleaning:
                 elif python_type == bool:
 
                     data[column] = data[column].str.lower()
-                    accepted_bool = {'ja': True, 'j': True, '1': True,
+                    accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
                                      'yes': True, 'y': True, 'true':True,
                                      't': True, 'nein': False, 'n': False,
                                      'no': False, 'false': False, 'f': False,
-                                     '0': False}
+                                     '0': False, 0:False}
                     data[column] = data[column].map(accepted_bool)
                     data[column] = data[column].astype(bool)
 

+ 39 - 18
cdplib/db_migration/ParseJsonSchema.py

@@ -51,15 +51,17 @@ class ParseJsonSchema(ParseDbSchema):
                 with open(schema_path, "r") as f:
                     schema = json.load(f)
 
-                ref_flag = self._analyze_schema(schema)
-
-                if ref_flag:
-                    schema = self._format_schema_for_mongo(schema)
+                definitions_flag = self._analyze_schema(schema)
+
+                if definitions_flag:
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     schema = self._dereference_schema(schema)
-                    schema = self._format_schema_for_mongo(schema)
+                    # Need to do it again since sub schema could also contain
+                    # single quotes
+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
                     self.schemas.append(schema)
+
                 else:
-                    schema = self._format_schema_for_mongo(schema)
                     self.schemas.append(schema)
 
             except Exception as e:
@@ -199,7 +201,7 @@ class ParseJsonSchema(ParseDbSchema):
                                  required_only=required_only)
 
         for schema in schemas[1:]:
-
+
             next_result = self._parse_one(schema=schema,
                                           field_info=field_info,
                                           required_only=required_only)
@@ -340,7 +342,7 @@ class ParseJsonSchema(ParseDbSchema):
 
         return already_parsed
 
-    def read_schema_and_parse_for_mongodb(self, schema_path: str) -> dict:
+    def load_and_parse_schema_for_mongodb(self, schema_path: str) -> dict:
         '''
         We need to deference json before import to Mongo DB pymongo can't deal with references
         :param str schema_path: path to the schema file.
@@ -353,9 +355,16 @@ class ParseJsonSchema(ParseDbSchema):
             schema = json.load(json_file)
 
         definitions_flag = self._analyze_schema(schema)
-
+
         if definitions_flag:
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
             schema = self._dereference_schema(schema)
+             # Need to do it again since sub schema could also contain
+             # single quotes
+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
+            schema = self._format_schema_for_mongo(schema)
+        else:
+            schema = self._format_schema_for_mongo(schema)
 
         return schema
 
@@ -371,12 +380,11 @@ class ParseJsonSchema(ParseDbSchema):
                 definitions_flag = self._analyze_schema(schema[key], definitions_flag)
 
         return definitions_flag
-
-    def _format_schema_for_mongo(self, schema: dict) -> dict:
+
+
+    def _clean_desciptions_tags_from_single_quotes(self, schema: dict) -> dict:
         '''
-        We use in the schema tags whih are not supported by mongo an threfore
-        must be taken care of before setting the schema for mongo.
-        :param str schema_path: path to the schema file.
+        :param dict schema: dictionary containing the schema
         '''
 
         for key in list(schema):
@@ -386,14 +394,27 @@ class ParseJsonSchema(ParseDbSchema):
                 schema[key] = cleaned_description
 
             if type(schema[key]) == dict:
-                self._format_schema_for_mongo(schema[key])
+                self._clean_desciptions_tags_from_single_quotes(schema[key])
+
+        return schema
 
-            if key == 'examples':
-                self._remove_examples(schema)
+    def _format_schema_for_mongo(self, schema: dict) -> dict:
+        '''
+        We use tags in the schema which are not supported by mongo and therefore
+        must be taken care of before setting the schema for mongo.
+        :param dict schema: dictionary containing the schema
+        '''
 
+        for key in list(schema):
+
+            if type(schema[key]) == dict:
+                self._format_schema_for_mongo(schema[key])
 
             if key == 'default' or key == 'default_values':
                 self._remove_defaults(schema)
+
+            if key == 'examples':
+                self._remove_examples(schema)
 
         return schema
 
@@ -411,7 +432,7 @@ class ParseJsonSchema(ParseDbSchema):
         schema = str(schema).replace("'", "\"")
         schema = jsonref.loads(schema, base_uri=base_dir_url)
         schema = deepcopy(schema)
-        #schema.pop('definitions', None)
+
         return schema
 
     def _remove_defaults(self, schema: dict) -> dict:

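A brief sketch (not part of the commit) of how the renamed helper ties in with the collMod call shown in the MongodbHandler hunk above; the schema path is a placeholder and the constructor keyword is an assumption:

from cdplib.db_migration.ParseJsonSchema import ParseJsonSchema

schema_path = 'mongo_schema/schema_process_data.json'  # invented path

parse_obj = ParseJsonSchema(schema_paths=[schema_path])  # assumed constructor usage

# Dereferences definitions, strips single quotes from description tags and drops
# the default/example tags that MongoDB validators do not accept.
mongo_schema = parse_obj.load_and_parse_schema_for_mongodb(schema_path)

command = {'collMod': 'process_data',
           'validator': {'$jsonSchema': mongo_schema}}
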
+ 40 - 4
cdplib/db_migration/ParseMapping.py

@@ -22,10 +22,10 @@ class ParseMapping:
         import json
         from cdplib.log import Log
 
-        self.log = Log('Parse Mapping')
+        self._log = Log('Parse Mapping')
 
         if not os.path.isfile(mapping_path):
-            err = "Mapping not found"
+            err = "Mapping not found "+mapping_path
             self._log.error(err)
             raise FileNotFoundError(err)
 
@@ -34,7 +34,7 @@ class ParseMapping:
                 self._mapping = json.load(f)
 
         except Exception as e:
-            err = ("Could not load mapping. "
+            err = ("Could not load mapping. " + mapping_path +
                    "Exit with error {}".format(e))
             self._log.error(err)
             raise Exception(err)
@@ -97,6 +97,42 @@ class ParseMapping:
         '''
         '''
         return self._get_info(key="date_format")
+
+    def get_internal_names(self) -> dict:
+        '''
+        '''
+
+        if all(["internal_name" in d for d in self._mapping]):
+            internal_names = [d["internal_name"] for d in self._mapping]
+
+        elif all(["internal_name" not in d for d in self._mapping]):
+            internal_names = list(range(len(self._mapping)))
+
+
+        else:
+            err = ("Incorrectly filled mapping. Internal names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return internal_names
+
+    def get_mongo_names(self) -> dict:
+        '''
+        '''
+        if all(["mongo_name" in d for d in self._mapping]):
+            mongo_names = [d["mongo_name"] for d in self._mapping]
+
+        elif all(["mongo_name" not in d for d in self._mapping]):
+            mongo_names = list(range(len(self._mapping)))
+
+        else:
+            err = ("Incorrectly filled mapping. Mongo names should "
+                   "either be in all or in neither of the fields")
+            self._log.error(err)
+            raise Exception(err)
+
+        return mongo_names
 
     def get_types(self) -> dict:
         '''
@@ -134,7 +170,7 @@ class ParseMapping:
         else:
             err = ("Incorrectly filled mapping. Column numbers should ",
                    "either in all or in neither of the fields")
-            self.log.err(err)
+            self._log.err(err)
             raise Exception(err)
 
         return column_numbers
+ 38 - 0
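A short sketch (not from the commit) of the two new accessors; the mapping path is invented, while the key names follow the pattern used in the class:

from cdplib.db_migration.ParseMapping import ParseMapping

# Hypothetical mapping file; each entry may carry "internal_name" and "mongo_name" keys.
mapping = ParseMapping('migration_mappings/process_mapping.json')

# Either all entries define the key, or none do; otherwise an exception is raised.
internal_names = mapping.get_internal_names()
mongo_names = mapping.get_mongo_names()
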
cdplib/log.py

@@ -60,6 +60,44 @@ class Log():
 
 
         # self._logger.setLevel(log_level)
         # self._logger.setLevel(log_level)
 
 
+
+    @property
+    def magenta(self):
+        return '\033[95m'
+
+    @property
+    def blue(self):
+        return '\033[94m'
+
+    @property
+    def cyan(self):
+        return '\u001b[36m'
+
+    @property
+    def green(self):
+        return '\033[92m'
+
+    @property
+    def yellow(self):
+        return '\033[93m'
+
+    @property
+    def fail(self):
+        return '\033[91m'
+
+    @property
+    def reset(self):
+        return '\033[0m'
+
+    @property
+    def bold(self):
+        return '\033[1m'
+
+    @property
+    def underline(self):
+        return '\033[4m'
+
+
     def info(self, message: str):
     def info(self, message: str):
         self._logger.info(message)
         self._logger.info(message)
 
 

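A quick sketch (not part of the commit) of combining the new color properties with reset; the logger name and messages are invented:

from cdplib.log import Log

log = Log('Color Demo')

# End each colored fragment with reset so later log lines keep the default color.
log.info(log.green + 'migration finished' + log.reset)
log.warning(log.yellow + '2 documents were skipped' + log.reset)
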
+ 1 - 1
cdplib/unit_tests/TestFlattenData.py

@@ -7,7 +7,7 @@ sys.path.append(os.getcwd())
 from cdplib.log import Log
 from cdplib.FlattenData import FlattenData
 
-class TestMongodbHandler(unittest.TestCase):
+class TestFlattenData(unittest.TestCase):
 
     def setUp(self):
         self.flattener = FlattenData()

+ 26 - 0
cdplib/unit_tests/TestLog.py

@@ -0,0 +1,26 @@
+import unittest
+import sys
+import os
+from pprint import pprint
+sys.path.append(os.getcwd())
+from cdplib.log import Log
+
+
+class TestLog(unittest.TestCase):
+
+    def setUp(self):
+       self._log = Log('Log Test')
+
+    def test_A_Log_Colors(self):
+        self._log.info('Test Starts Here')
+        self._log.info(self._log.magenta + "Header")
+        self._log.info(self._log.blue + "Blue")
+        self._log.info( self._log.green + "Green")
+        self._log.info(self._log.yellow + "yellow")
+        self._log.info(self._log.fail + "Fail")
+        self._log.info(self._log.reset + "reset")
+        self._log.info(self._log.bold + "bold" )
+        self._log.info(self._log.underline + "underline" )
+
+if __name__ == '__main__':
+    unittest.main()

+ 14 - 5
cdplib/unit_tests/TestMongodbHandler.py

@@ -1,6 +1,7 @@
 import unittest
 import sys
 import os
+import time
 from pymongo import MongoClient
 sys.path.append(os.getcwd())
 from cdplib.log import Log
@@ -25,12 +26,14 @@ class TestMongodbHandler(unittest.TestCase):
         self.valid_input = {
                         "test_value_string": "test_value",
                         "test_value_double": 2.4,
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2020-01-28T15:45:25.000Z"
                         }
         self.invalid_input = {
                         "test_value_string": 1,
                         "test_value_double": "Wrong value",
-                        "test_value_double_array": [1.4, 1.6, 3.5]
+                        "test_value_double_array": [1.4, 1.6, 3.5],
+                        "test_value_date": "2019-01-28T15:45:25.000Z"
                         }
 
 
@@ -81,9 +84,10 @@ class TestMongodbHandler(unittest.TestCase):
         Fetch data and confirms thats it is the same as was entered into the database
         Do the same with more specific query
         '''
+
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name, 'test_value_string', 'test_value').to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-
+
     def test_F_aggregate_data_and_generate_dataframe(self):
         '''
         Make an aggregation call
@@ -93,7 +97,7 @@ class TestMongodbHandler(unittest.TestCase):
                                 { '$match': {}}
                                 ]
         self.assertEqual(self.mongodb_handler.aggregate_data_and_generate_dataframe(self.first_collection_name, aggregation_pipeline).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
-
+
     def test_G_update_data_in_collection(self):
         '''
         Fetch data from database
@@ -104,7 +108,7 @@ class TestMongodbHandler(unittest.TestCase):
         '''
         original_value = self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertEqual(original_value, self.valid_input['test_value_string'])
-        self.mongodb_handler.update_data_in_collection('test_value_string', 'test_value', 'test_value_string', 'new_test_value', self.first_collection_name)
+        self.mongodb_handler.update_data_in_collection('test_value_string', 'new_test_value', self.first_collection_name, 'test_value_string', 'test_value', create_if_not_exist=False)
         new_value =  self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
         self.assertNotEqual(original_value, new_value)
 
@@ -116,6 +120,11 @@ class TestMongodbHandler(unittest.TestCase):
         index = 'test_value_string'
         self.mongodb_handler.create_index(self.first_collection_name, index)
         self.assertTrue(index in list(self.database[self.first_collection_name].index_information().keys()))
+
+    def test_I_query_data_between_dates_and_generate_dataframe(self):
+
+            data = self.mongodb_handler.query_data_between_dates_and_generate_dataframe(self.first_collection_name, "test_value_date", "2020-01-27T15:45:25.000Z", "2020-01-29T15:45:25.000Z", index ='test_value_string')
+            self.assertEqual(data['test_value_double'][0], self.valid_input['test_value_double'])
 
     def test_Y_drop_collection(self):
         '''

+ 13 - 0
hooks/README.txt

@@ -0,0 +1,13 @@
+These files are GIT HOOKS.
+
+A git hook is a script which is executed when a git command is run.
+The hook in this folder is executed when committing (pre-commit).
+
+pre-commit runs the unit tests before the changes are committed.
+
+ATTENTION!
+Changes will still be committed and pushed even if there are errors in the tests (for now at least).
+So please pay attention to the tests and make sure that they ran without any problems. If a test
+failed, please fix the issue before pushing the changes!
+
+To use them, please copy the files into your .git/hooks folder.

+ 17 - 0
hooks/pre-commit

@@ -0,0 +1,17 @@
+#!/bin/bash
+#
+# Runs the test pipeline before committing
+#
+# To enable this hook, copy this file into your .git/hooks folder.
+echo
+echo "Running unit tests"
+echo
+
+python cdplib/unit_tests/TestFlattenData.py
+python cdplib/unit_tests/TestLog.py
+python cdplib/unit_tests/TestMongodbHandler.py
+
+echo
+echo
+echo -e "Unit tests have been run and your changes committed; if any of the tests failed, please correct this before pushing."
+echo

+ 5 - 20
setup.py

@@ -1,32 +1,17 @@
 from setuptools import setup,find_packages
 
 INSTALL_REQUIRES = [
-        'pycodestyle',
-        'ipykernel',
-        'spyder-kernels==0.*',
-        'cloudpickle',
-        'openpyxl',
-        'setuptools',
-        'scipy',
-        'matplotlib',
-        'tsfresh',
-        'hyperopt',
-        'xgboost',
-        'scikit-learn',
-        'pandas',
-        'pandas-compat',
-        'xmltodict',
+        'pandas',
         'sqlalchemy',
         'sqlparse',
         'pymysql',
-        'xlrd',
         'pymongo',
-        'jsonref',
-        'faker',
-        'xeger',
+        'jsonref',
         'simplejson',
         'mysql',
-        'sqlalchemy-utils',
+        'sqlalchemy_utils',
+        'sklearn',
+        'hyperopt',
 ]