5 years ago · 094f32f2fb
--- a/Pipfile
+++ b/Pipfile
@@ -7,33 +7,15 @@ verify_ssl = true
 
				 
			
 
				 [packages]
			
 
				 cdplib = {editable = true,git = "https://readonly:readonly@intra.acdp.at/gogs/tanja/cdplib.git"}
			
 
				-pycodestyle = "*"
			
 
				-ipykernel = "*"
			
 
				-spyder-kernels = "==0.*"
			
 
				-cloudpickle = "*"
			
 
				-openpyxl = "*"
			
 
				-setuptools = "*"
			
 
				-scipy = "*"
			
 
				-matplotlib = "*"
			
 
				-tsfresh = "*"
			
 
				-hyperopt = "*"
			
 
				-xgboost = "*"
			
 
				-scikit-learn = "*"
			
 
				 pandas = "!=0.24.0"
			
 
				-pandas-compat = "*"
			
 
				-xmltodict = "*"
			
 
				 sqlalchemy = "*"
			
 
				 sqlparse = "*"
			
 
				 pymysql = "*"
			
 
				-xlrd = "*"
			
 
				 pymongo = "*"
			
 
				 jsonref = "*"
			
 
				-faker = "*"
			
 
				-xeger = "*"
			
 
				 simplejson = "*"
			
 
				 mysql = "*"
			
 
				-sqlalchemy-utils = "*"
			
 
				-apyori==1.1.1
			
 
				+hyperopt = "*"
			
 
				 
			
 
				 [requires]
			
 
				 python_version = "3"
			
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/cdplib/db_handlers/MongodbHandler.py
+++ b/cdplib/db_handlers/MongodbHandler.py
@@ -13,11 +13,13 @@ Created on Mon Sep 16 13:27:44 2019
 
				 import simplejson
			
 
				 import sys
			
 
				 import os
			
 
				+import time
			
 
				 
			
 
				 import pymongo
			
 
				 from pymongo import MongoClient
			
 
				 import pandas as pd
			
 
				 import numpy as np
			
 
				+from pprint import pprint
			
 
				 
			
 
				 sys.path.append(os.getcwd())
			
 
				 from cdplib.log import Log
			
@@ -171,7 +173,7 @@ class MongodbHandler:
 
				         command = {
			
 
				                     'collMod': collection_name,
			
 
				                     'validator': {
			
 
				-                        '$jsonSchema': parse_obj.schemas[0]
			
 
				+                        '$jsonSchema': parse_obj.load_and_parse_schema_for_mongodb(schema_path)
			
 
				                     },
			
 
				                     'validationLevel': validation_level,
			
 
				                     'validationAction': validation_action
			
@@ -239,7 +241,22 @@ class MongodbHandler:
 
				                 self._database[collection_name].insert_many(data, ordered=ordered)
			
 
				 
			
 
				         except Exception as error:
			
 
				-            self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
			
 
				+            if len(data) > 1:
			
 
				+
			
 
				+                self._log.warning(('An error occured inserting {} documents into database: {} and collection: {}.').format(len(data), self._database_name, collection_name))
			
 
				+                self._log.warning('This might be because one or more documents are invalid.') 
			
 
				+                self._log.warning('We will try to insert the documents one-by-one and report which are invalid.')
			
 
				+                self._log.warning(('Error: {}').format(error))
			
 
				+                
			
 
				+                for row in data:
			
 
				+
			
 
				+                    try:
			
 
				+                        self._database[collection_name].insert_one(row)
			
 
				+                    except Exception as error:
			
 
				+                        pprint(row)
			
 
				+                        self._log.warning(error)
			
 
				+            else:
			
 
				+                self._log.log_and_raise_error(('An error occured when trying to insert data into {}, {}. \nError: {}').format(self._database_name, collection_name, error))
			
 
				 
			
 
				         self._log.info(('Data has been inserted into the {} collection').format(collection_name))
			
 
				 
			
@@ -267,16 +284,22 @@ class MongodbHandler:
 
				 
			
 
				         try:
			
 
				             if attribute == None or attribute_value == None:
			
 
				-                data = self._database[collection_name].find({},return_values)
			
 
				+                query = {}
			
 
				+                data = self._database[collection_name].find(query,return_values)
			
 
				+                
			
 
				             else:
			
 
				-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, return_values)
			
 
				+                query = {attribute: {comparison_operator: attribute_value}}
			
 
				+                data = self._database[collection_name].find(query, return_values)
			
 
				 
			
 
				         except Exception as error:
			
 
				             self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: {}:{}. \nError:{}').format(collection_name, attribute, comparison_operator, attribute_value, error))
			
 
				-        if return_as_dataframe:
			
 
				-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				-        else:
			
 
				-            return data
			
 
				+            return None
			
 
				+
			
 
				+        if data.collection.count_documents(query) != 0:
			
 
				+            if return_as_dataframe:
			
 
				+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				+            else:
			
 
				+                return data
			
 
				 
			
 
				     def aggregate_data_and_generate_dataframe(self, collection_name: str, aggregation_pipeline: list, index: str = None):
			
 
				 
			
@@ -284,11 +307,16 @@ class MongodbHandler:
 
				             data = self._database[collection_name].aggregate(pipeline=aggregation_pipeline, allowDiskUse=True)
			
 
				         except Exception as error:
			
 
				             self._log.log_and_raise_error(('A problem occured when aggregating the collection {} with the pipeline {}. \nError: {}').format(collection_name, aggregation_pipeline, error))
			
 
				+            return None
			
 
				 
			
 
				         return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				 
			
 
				-    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None) -> pd.DataFrame():
			
 
				+    def convert_mongo_data_into_dataframe(self, data, index: str = None, collection_name: str = None, chunksize: int = 500) -> pd.DataFrame():
			
 
				 
			
 
				+        start_time = time.time()
			
 
				+        '''
			
 
				+        self._log.info('Converting returned mongo data into a DataFrame')
			
 
				+        
			
 
				         data = list(data)
			
 
				         try:
			
 
				             if len(data)> 0:
			
@@ -299,11 +327,38 @@ class MongodbHandler:
 
				                 df = pd.DataFrame(data)
			
 
				                 if index is not None:
			
 
				                     df.set_index(index, inplace=True)
			
 
				+
			
 
				+                self._log.info(('DataFrame conversion is done, took {} seconds').format(time.time()-start_time))
			
 
				                 return df
			
 
				             else:
			
 
				                 self._log.warning(('No data for the query was found').format())
			
 
				         except Exception as error:
			
 
				             self._log.log_and_raise_error(('An error occured trying to convert mongo data into pd.Dataframe. \nError: {} ').format(error))
			
 
				+        '''
			
 
				+    
			
 
				+        frames = []
			
 
				+        records = []
			
 
				+        for iteration, value in enumerate(data):
			
 
				+            # Buffer documents and turn every full batch of `chunksize` rows into a DataFrame.
			
 
				+            records.append(value)
			
 
				+            if (iteration + 1) % chunksize == 0:
			
 
				+                frames.append(pd.DataFrame(records))
			
 
				+                records = []
			
 
				+        # Flush the last, partially filled batch.
			
 
				+        if records:
			
 
				+            frames.append(pd.DataFrame(records))
			
 
				+        # pd.concat raises on an empty list; ignore_index gives one continuous RangeIndex across chunks.
			
 
				+        return_df = pd.concat(frames, axis=0, sort=False, ignore_index=True) if frames else pd.DataFrame()
			
 
				+
			
 
				+        if index is not None and not return_df.empty:
			
 
				+            return_df.set_index(index, inplace=True)
			
 
				+
			
 
				+        self._log.info(('{} Rows were fetched from {}. DataFrame conversion is done, took {} seconds').format(len(return_df.index), collection_name if collection_name is not None else 'the database', time.time()-start_time))
			
 
				+        
			
 
				+        return return_df
			
 
				+
			
 
				+ 
			
 
				+        
			
 
				 
			
 
				     #def update_data_in_collection(self, query_label: str, query_value: str, update_label:str, update_value: str, collection_name:str):
			
 
				     #    self._database[collection_name].update_one({query_label:query_value}, {"$set": {update_label: update_value}})
			
@@ -338,7 +393,7 @@ class MongodbHandler:
 
				         '''
			
 
				         return self._database[collection_name].find({query_label:query_value}).count() > 0
			
 
				 
			
 
				-    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_as_dataframe: bool = True):
			
 
				+    def query_data_between_dates_and_generate_dataframe(self, collection_name: str, date_label: str, from_date_value: str, to_date_value: str, index: str = None, return_id: bool = False, return_as_dataframe: bool = True):
			
 
				         '''
			
 
				             Queries data between two dates.
			
 
				 
			
@@ -349,16 +404,20 @@ class MongodbHandler:
 
				             :param str index:
			
 
				             :param bool return_as_dataframe:
			
 
				         '''
			
 
				+        assert(isinstance(collection_name, str)),\
			
 
				+            "Parameter 'collection_name' must be a string type"
			
 
				         try:
			
 
				-            data = self._database[collection_name].find({date_label: {'$gt': from_date_value, '$lt': to_date_value}})
			
 
				+            query = {date_label: {'$gt': from_date_value, '$lt': to_date_value}}
			
 
				+            data = self._database[collection_name].find(query, {'_id': return_id})
			
 
				 
			
 
				         except Exception as error:
			
 
				-            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}: $gt:{}, $lt:{}. \nError:{}').format(collection_name, date_label, from_date_value, to_date_value, error))
			
 
				+            self._log.log_and_raise_error(('An error occured trying to query data from {}, with query {}. \nError:{}').format(collection_name, query, error))
			
 
				 
			
 
				-        if return_as_dataframe:
			
 
				-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				-        else:
			
 
				-            return data
			
 
				+        if data.collection.count_documents(query) != 0:
			
 
				+            if return_as_dataframe:
			
 
				+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				+            else:
			
 
				+                return data
			
 
				 
			
 
				     def query_oldest_or_newest_date_in_collection(self, collection_name: str, date_label: str, oldest: bool = False):
			
 
				 
			
@@ -385,21 +444,23 @@ class MongodbHandler:
 
				         try:
			
 
				 
			
 
				             if attribute == None or attribute_value == None:
			
 
				-                data = self._database[collection_name].find({},{'_id': return_id}).sort(sort_label, direction).limit(limit)
			
 
				+                query = {}
			
 
				+                data = self._database[collection_name].find(query,{'_id': return_id}).sort(sort_label, direction).limit(limit)
			
 
				             else:
			
 
				-                data = self._database[collection_name].find({attribute: {comparison_operator: attribute_value}}, {'_id': return_id}).sort(sort_label, direction).limit(limit)
			
 
				-
			
 
				-            if len(list(data)) == 0:
			
 
				-                self._log.warning('No data was found for the query')
			
 
				-                return None
			
 
				+                query = {attribute: {comparison_operator: attribute_value}}
			
 
				+                data = self._database[collection_name].find(query, {'_id': return_id}).sort(sort_label, direction).limit(limit)
			
 
				 
			
 
				         except Exception as error:
			
 
				             self._log.log_and_raise_error(('An error occured trying to query data from {}, \nError:{}').format(collection_name, error))
			
 
				 
			
 
				-        if return_as_dataframe:
			
 
				-            return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				+        if data.collection.count_documents(query) != 0:
			
 
				+            if return_as_dataframe:
			
 
				+                return self.convert_mongo_data_into_dataframe(data, index, collection_name)
			
 
				+            else:
			
 
				+                return data
			
 
				         else:
			
 
				-            return data
			
 
				+            self._log.warning('No data was found for the query')
			
 
				+            return None
			
 
				 
			
 
				     def update_data_in_collection(self, update_label:str, update_value: str, collection_name:str, query_label: str = None, query_value: str = None, create_if_not_exist: bool = True, find_query: dict = None, update_many: bool = False):
			
 
				 
			
--- a/cdplib/db_handlers/SQLHandler.py
+++ b/cdplib/db_handlers/SQLHandler.py
@@ -1,5 +1,3 @@
 
				-#!/usr/bin/env python3
			
 
				-# -*- coding: utf-8 -*-
			
 
				 """
			
 
				 Created on Tue Sep 18 16:20:50 2018
			
 
				 
			
@@ -211,13 +209,17 @@ class SQLHandler:
 
				         transaction = connection.begin()
			
 
				 
			
 
				         errors = []
			
 
				+        results = []
			
 
				 
			
 
				         # in the case of multi-query execute each query
			
 
				         for sub_query in sqlparse.split(query):
			
 
				             if len(sub_query) > 0:
			
 
				                 try:
			
 
				-                    connection.execute(sub_query, multi=True)
			
 
				-
			
 
				+                    result = connection.execute(sub_query)
			
 
				+                    if result.returns_rows:
			
 
				+                        data = pd.DataFrame(result.fetchall())
			
 
				+                        data.columns = result.keys()
			
 
				+                        results.append(data)
			
 
				                 except Exception as e:
			
 
				                     errors.append(str(e))
			
 
				 
			
@@ -231,6 +233,7 @@ class SQLHandler:
 
				 
			
 
				         transaction.commit()
			
 
				         connection.close()
			
 
				+        return results
			
 
				 
			
 
				     def execute_query_from_file(self, filename: str):
			
 
				         '''
			
@@ -441,7 +444,7 @@ class SQLHandler:
 
				                                tablename)
			
 
				 
			
 
				             data = self.execute(query)
			
 
				-            colnames = data.columns.tolist()
			
 
				+            colnames = data[0].columns.tolist()
			
 
				 
			
 
				         return colnames
			
 
				 
			
@@ -640,4 +643,4 @@ class SQLHandler:
 
				             self._engine.dispose()
			
 
				         except Exception as e:
			
 
				             print(('An error occured when trying to dispose the SQL engine. Error: {}').format(e))
			
 
				-            raise Exception(e)
			
 
				+            raise Exception(e)
			
--- a/cdplib/db_migration/DataFrameToCollection.py
+++ b/cdplib/db_migration/DataFrameToCollection.py
@@ -51,7 +51,8 @@ class DataFrameToCollection():
 
				     def to_list_of_documents(self, data: pd.DataFrame,
			
 
				                              grp_fields: list,
			
 
				                              schema: dict = None,
			
 
				-                             _final_step: bool = True) -> list:
			
 
				+                             _final_step: bool = True,
			
 
				+                             already_reshaped: list = []) -> list:
			
 
				         '''
			
 
				         Reshapes a pandas dataframe to a list of documents according
			
 
				          to a complex (json) mongodb schema
			
@@ -84,128 +85,135 @@ class DataFrameToCollection():
 
				             if field not in self._unroll_nested_names(data.columns):
			
 
				                 continue
			
 
				 
			
 
				-            field_type = schema["properties"][field]["bsonType"]
			
 
				+            if field in already_reshaped:
			
 
				+                reshaped_field = data.groupby(grp_fields, sort=False)[field]\
			
 
				+                                                .apply(self._make_flattened_list_of_distinct)
			
 
				+                reshaped_fields.append(reshaped_field)
			
 
				+            else:
			
 
				+                field_type = schema["properties"][field]["bsonType"]
			
 
				 
			
 
				-            # if field has a simple type
			
 
				-            if field_type not in ["array", "object"]:
			
 
				+                # if field has a simple type
			
 
				+                if field_type not in ["array", "object"]:
			
 
				 
			
 
				-                grp_fields = [c for c in grp_fields if c in data.columns]
			
 
				+                    grp_fields = [c for c in grp_fields if c in data.columns]
			
 
				 
			
 
				-                # check that there is only one possible value of this field
			
 
				-                n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
			
 
				+                    # check that there is only one possible value of this field
			
 
				+                    n_distinct_values = data.groupby(grp_fields, sort=False)[field].nunique().max()
			
 
				 
			
 
				-                # n_distinct_valus can be 0 if the column only contains NaN values
			
 
				-                if n_distinct_values > 1:
			
 
				-                    err = "Field {0} is not unique with respect to {1}"\
			
 
				-                          .format(field, grp_fields)
			
 
				+                    # n_distinct_values can be 0 if the column only contains NaN values
			
 
				+                    if n_distinct_values > 1:
			
 
				+                        err = "Field {0} is not unique with respect to {1}"\
			
 
				+                            .format(field, grp_fields)
			
 
				 
			
 
				-                    self._log.error(err)
			
 
				-                    raise Exception(err)
			
 
				+                        self._log.error(err)
			
 
				+                        raise Exception(err)
			
 
				 
			
 
				-                if field not in grp_fields:
			
 
				-                    reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
			
 
				-                else:
			
 
				-                    reshaped_field =\
			
 
				-                        data[grp_fields].drop_duplicates()\
			
 
				-                        .set_index(grp_fields, drop=False)[field]
			
 
				+                    if field not in grp_fields:
			
 
				+                        reshaped_field = data.groupby(grp_fields, sort=False)[field].first()
			
 
				+                    else:
			
 
				+                        reshaped_field =\
			
 
				+                            data[grp_fields].drop_duplicates()\
			
 
				+                            .set_index(grp_fields, drop=False)[field]
			
 
				 
			
 
				-                reshaped_fields.append(reshaped_field)
			
 
				+                    reshaped_fields.append(reshaped_field)
			
 
				 
			
 
				-            # if field is sub-document (dictionary)
			
 
				-            elif field_type == "object":
			
 
				+                # if field is sub-document (dictionary)
			
 
				+                elif field_type == "object":
			
 
				 
			
 
				-                sub_schema = deepcopy(schema["properties"][field])
			
 
				+                    sub_schema = deepcopy(schema["properties"][field])
			
 
				 
			
 
				-                # rename sub-schema properties to match with data column names
			
 
				-                sub_schema["properties"] =\
			
 
				-                    {".".join([field, k]): v for k, v
			
 
				-                     in sub_schema["properties"].items()}
			
 
				+                    # rename sub-schema properties to match with data column names
			
 
				+                    sub_schema["properties"] =\
			
 
				+                        {".".join([field, k]): v for k, v
			
 
				+                        in sub_schema["properties"].items()}
			
 
				 
			
 
				-                sub_data = self.to_list_of_documents(
			
 
				-                            data=data,
			
 
				-                            schema=sub_schema,
			
 
				-                            grp_fields=grp_fields,
			
 
				-                            _final_step=False)
			
 
				+                    sub_data = self.to_list_of_documents(
			
 
				+                                data=data,
			
 
				+                                schema=sub_schema,
			
 
				+                                grp_fields=grp_fields,
			
 
				+                                _final_step=False,
			
 
				+                                already_reshaped=already_reshaped)
			
 
				 
			
 
				-                # Need to be checked since child elements can be empty
			
 
				-                if sub_data is not None:
			
 
				+                    # Need to be checked since child elements can be empty
			
 
				+                    if sub_data is not None:
			
 
				 
			
 
				-                    reshaped_field = sub_data.apply(self._make_dict, axis=1)
			
 
				-                    reshaped_field.name = field
			
 
				+                        reshaped_field = sub_data.apply(self._make_dict, axis=1)
			
 
				+                        reshaped_field.name = field
			
 
				 
			
 
				-                    reshaped_fields.append(reshaped_field)
			
 
				+                        reshaped_fields.append(reshaped_field)
			
 
				 
			
 
				-            # if field is a list of dictionaries
			
 
				-            elif field_type == "array":
			
 
				+                # if field is a list of dictionaries
			
 
				+                elif field_type == "array":
			
 
				 
			
 
				 
			
 
				-                items_type = schema["properties"][field]["items"]["bsonType"]
			
 
				+                    items_type = schema["properties"][field]["items"]["bsonType"]
			
 
				 
			
 
				-                if items_type == "object":
			
 
				-                    array_object = time.time()
			
 
				-                    sub_schema = deepcopy(schema["properties"][field]["items"])
			
 
				+                    if items_type == "object":
			
 
				+                        array_object = time.time()
			
 
				+                        sub_schema = deepcopy(schema["properties"][field]["items"])
			
 
				 
			
 
				-                    # rename sub-schema properties to match data column names
			
 
				-                    sub_schema["properties"] =\
			
 
				-                        {".".join([field, k]): v for k, v in
			
 
				-                         sub_schema["properties"].items()}
			
 
				+                        # rename sub-schema properties to match data column names
			
 
				+                        sub_schema["properties"] =\
			
 
				+                            {".".join([field, k]): v for k, v in
			
 
				+                            sub_schema["properties"].items()}
			
 
				 
			
 
				-                    # extend grp fields by sub-fields of field simple types
			
 
				-                    sub_grp_fields = [f for f in sub_schema["properties"]
			
 
				-                                      if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
			
 
				-                                      and (f in data.columns)]
			
 
				+                        # extend grp fields by sub-fields of field simple types
			
 
				+                        sub_grp_fields = [f for f in sub_schema["properties"]
			
 
				+                                        if (sub_schema["properties"][f]["bsonType"] not in ["array", "object"])
			
 
				+                                        and (f in data.columns)]
			
 
				 
			
 
				-                    if len(sub_grp_fields) == 0:
			
 
				-                        err = ("One of the sub-keys in a list of documents"
			
 
				-                               " must be of simple type for the field {}"
			
 
				-                               .format(field))
			
 
				+                        if len(sub_grp_fields) == 0:
			
 
				+                            err = ("One of the sub-keys in a list of documents"
			
 
				+                                " must be of simple type for the field {}"
			
 
				+                                .format(field))
			
 
				 
			
 
				-                        self._log.error(err)
			
 
				-                        raise Exception(err)
			
 
				+                            self._log.error(err)
			
 
				+                            raise Exception(err)
			
 
				 
			
 
				-                    # group and reshape sub-fields with complex types
			
 
				-                    sub_data = self.to_list_of_documents(
			
 
				-                                data=data,
			
 
				-                                schema=sub_schema,
			
 
				-                                grp_fields=grp_fields + sub_grp_fields,
			
 
				-                                _final_step=False)
			
 
				+                        # group and reshape sub-fields with complex types
			
 
				+                        sub_data = self.to_list_of_documents(
			
 
				+                                    data=data,
			
 
				+                                    schema=sub_schema,
			
 
				+                                    grp_fields=grp_fields + sub_grp_fields,
			
 
				+                                    _final_step=False,
			
 
				+                                    already_reshaped=already_reshaped)
			
 
				 
			
 
				-                    if sub_data is not None:
			
 
				+                        if sub_data is not None:
			
 
				 
			
 
				-                        # gether the results into a list of dictionaries
			
 
				-                        sub_data = sub_data.apply(self._make_dict, axis=1)
			
 
				+                            # gather the results into a list of dictionaries
			
 
				+                            sub_data = sub_data.apply(self._make_dict, axis=1)
			
 
				 
			
 
				-                        sub_data.name = field
			
 
				-                        sub_data = sub_data.reset_index(grp_fields)
			
 
				-                        ######################################################
			
 
				-                        ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
			
 
				-                        reshaped_field =\
			
 
				-                            sub_data.groupby(grp_fields, sort=False)[field]\
			
 
				-                                    .apply(self._make_list_of_distinct)
			
 
				-                        ######################################################
			
 
				-                        reshaped_fields.append(reshaped_field)
			
 
				+                            sub_data.name = field
			
 
				+                            sub_data = sub_data.reset_index(grp_fields)
			
 
				+                            ######################################################
			
 
				+                            ######## OPTIMIZATIONS MAY BE POSSIBLE HERE ##########
			
 
				+                            reshaped_field =\
			
 
				+                                sub_data.groupby(grp_fields, sort=False)[field]\
			
 
				+                                        .apply(self._make_list_of_distinct)
			
 
				+                            ######################################################
			
 
				+                            reshaped_fields.append(reshaped_field)
			
 
				 
			
 
				 
			
 
				-                # if field is a list of values with simple type
			
 
				-                elif items_type == "array":
			
 
				-                    grp_fields = [c for c in grp_fields if c in data.columns]
			
 
				+                    # if field is a list of values with simple type
			
 
				+                    elif items_type == "array":
			
 
				+                        grp_fields = [c for c in grp_fields if c in data.columns]
			
 
				 
			
 
				-                    if field in data.columns:
			
 
				+                        if field in data.columns:
			
 
				 
			
 
				-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
			
 
				-                                             .apply(self._make_list_of_distinct)
			
 
				+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
			
 
				+                                                .apply(self._make_list_of_distinct)
			
 
				 
			
 
				-                        reshaped_fields.append(reshaped_field)
			
 
				-                else:
			
 
				+                            reshaped_fields.append(reshaped_field)
			
 
				+                    else:
			
 
				 
			
 
				-                    grp_fields = [c for c in grp_fields if c in data.columns]
			
 
				+                        grp_fields = [c for c in grp_fields if c in data.columns]
			
 
				 
			
 
				-                    if field in data.columns:
			
 
				+                        if field in data.columns:
			
 
				 
			
 
				-                        reshaped_field = data.groupby(grp_fields, sort=False)[field]\
			
 
				-                                             .apply(self._make_flattened_list_of_distinct)
			
 
				+                            reshaped_field = data.groupby(grp_fields, sort=False)[field]\
			
 
				+                                                .apply(self._make_flattened_list_of_distinct)
			
 
				 
			
 
				-                        reshaped_fields.append(reshaped_field)
			
 
				+                            reshaped_fields.append(reshaped_field)
			
 
				 
			
 
				         if len(reshaped_fields) > 0:
			
 
				 
			
--- a/cdplib/db_migration/MigrationCleaning.py
+++ b/cdplib/db_migration/MigrationCleaning.py
@@ -358,11 +358,11 @@ class MigrationCleaning:
 
				                 elif python_type == bool:
			
 
				 
			
 
				                     data[column] = data[column].str.lower()
			
 
				-                    accepted_bool = {'ja': True, 'j': True, '1': True,
			
 
				+                    accepted_bool = {'ja': True, 'j': True, '1': True, 1:True,
			
 
				                                      'yes': True, 'y': True, 'true':True,
			
 
				                                      't': True, 'nein': False, 'n': False,
			
 
				                                      'no': False, 'false': False, 'f': False,
			
 
				-                                     '0': False}
			
 
				+                                     '0': False, 0:False}
			
 
				                     data[column] = data[column].map(accepted_bool)
			
 
				                     data[column] = data[column].astype(bool)
			
 
				 
			
--- a/cdplib/db_migration/ParseJsonSchema.py
+++ b/cdplib/db_migration/ParseJsonSchema.py
@@ -51,15 +51,17 @@ class ParseJsonSchema(ParseDbSchema):
 
				                 with open(schema_path, "r") as f:
			
 
				                     schema = json.load(f)
			
 
				 
			
 
				-                ref_flag = self._analyze_schema(schema)
			
 
				-
			
 
				-                if ref_flag:
			
 
				-                    schema = self._format_schema_for_mongo(schema)
			
 
				+                definitions_flag = self._analyze_schema(schema)
			
 
				+                
			
 
				+                if definitions_flag:
			
 
				+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
			
 
				                     schema = self._dereference_schema(schema)
			
 
				-                    schema = self._format_schema_for_mongo(schema)
			
 
				+                    # Need to do it again since sub schema could also contain
			
 
				+                    # single quotes
			
 
				+                    schema = self._clean_desciptions_tags_from_single_quotes(schema)
			
 
				                     self.schemas.append(schema)
			
 
				+                    
			
 
				                 else:
			
 
				-                    schema = self._format_schema_for_mongo(schema)
			
 
				                     self.schemas.append(schema)
			
 
				 
			
 
				             except Exception as e:
			
@@ -199,7 +201,7 @@ class ParseJsonSchema(ParseDbSchema):
 
				                                  required_only=required_only)
			
 
				 
			
 
				         for schema in schemas[1:]:
			
 
				-
			
 
				+            
			
 
				             next_result = self._parse_one(schema=schema,
			
 
				                                           field_info=field_info,
			
 
				                                           required_only=required_only)
			
@@ -340,7 +342,7 @@ class ParseJsonSchema(ParseDbSchema):
 
				 
			
 
				         return already_parsed
			
 
				 
			
 
				-    def read_schema_and_parse_for_mongodb(self, schema_path: str) -> dict:
			
 
				+    def load_and_parse_schema_for_mongodb(self, schema_path: str) -> dict:
			
 
				         '''
			
 
				         We need to dereference the json before import to Mongo DB, since pymongo can't deal with references
			
 
				         :param str schema_path: path to the schema file.
			
@@ -353,9 +355,16 @@ class ParseJsonSchema(ParseDbSchema):
 
				             schema = json.load(json_file)
			
 
				 
			
 
				         definitions_flag = self._analyze_schema(schema)
			
 
				-
			
 
				+        
			
 
				         if definitions_flag:
			
 
				+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
			
 
				             schema = self._dereference_schema(schema)
			
 
				+             # Need to do it again since sub schema could also contain
			
 
				+             # single quotes
			
 
				+            schema = self._clean_desciptions_tags_from_single_quotes(schema)
			
 
				+            schema = self._format_schema_for_mongo(schema)
			
 
				+        else:
			
 
				+            schema = self._format_schema_for_mongo(schema)
			
 
				 
			
 
				         return schema
			
 
				 
			
@@ -371,12 +380,11 @@ class ParseJsonSchema(ParseDbSchema):
 
				                 definitions_flag = self._analyze_schema(schema[key], definitions_flag)
			
 
				 
			
 
				         return definitions_flag
			
 
				-
			
 
				-    def _format_schema_for_mongo(self, schema: dict) -> dict:
			
 
				+    
			
 
				+    
			
 
				+    def _clean_desciptions_tags_from_single_quotes(self, schema: dict) -> dict:
			
 
				         '''
			
 
				-        We use in the schema tags whih are not supported by mongo an threfore
			
 
				-        must be taken care of before setting the schema for mongo.
			
 
				-        :param str schema_path: path to the schema file.
			
 
				+        :param dict schema: dictionary containing the schema
			
 
				         '''
			
 
				 
			
 
				         for key in list(schema):
			
@@ -386,14 +394,27 @@ class ParseJsonSchema(ParseDbSchema):
 
				                 schema[key] = cleaned_description
			
 
				 
			
 
				             if type(schema[key]) == dict:
			
 
				-                self._format_schema_for_mongo(schema[key])
			
 
				+                self._clean_desciptions_tags_from_single_quotes(schema[key])
			
 
				+                
			
 
				+        return schema
			
 
				 
			
 
				-            if key == 'examples':
			
 
				-                self._remove_examples(schema)
			
 
				+    def _format_schema_for_mongo(self, schema: dict) -> dict:
			
 
				+        '''
			
 
				+        We use tags in the schema which are not supported by mongo and therefore
			
 
				+        must be taken care of before setting the schema for mongo.
			
 
				+        :param dict schema: dictionary containing the schema.
			
 
				+        '''
			
 
				 
			
 
				+        for key in list(schema):
			
 
				+
			
 
				+            if type(schema[key]) == dict:
			
 
				+                self._format_schema_for_mongo(schema[key])
			
 
				 
			
 
				             if key == 'default' or key == 'default_values':
			
 
				                 self._remove_defaults(schema)
			
 
				+                
			
 
				+            if key == 'examples':
			
 
				+                self._remove_examples(schema)
			
 
				 
			
 
				         return schema
			
 
				 
			
@@ -411,7 +432,7 @@ class ParseJsonSchema(ParseDbSchema):
 
				         schema = str(schema).replace("'", "\"")
			
 
				         schema = jsonref.loads(schema, base_uri=base_dir_url)
			
 
				         schema = deepcopy(schema)
			
 
				-        #schema.pop('definitions', None)
			
 
				+
			
 
				         return schema
			
 
				 
			
 
				     def _remove_defaults(self, schema: dict) -> dict:
			
--- a/cdplib/db_migration/ParseMapping.py
+++ b/cdplib/db_migration/ParseMapping.py
@@ -22,10 +22,10 @@ class ParseMapping:
 
				         import json
			
 
				         from cdplib.log import Log
			
 
				 
			
 
				-        self.log = Log('Parse Mapping')
			
 
				+        self._log = Log('Parse Mapping')
			
 
				 
			
 
				         if not os.path.isfile(mapping_path):
			
 
				-            err = "Mapping not found"
			
 
				+            err = "Mapping not found "+mapping_path
			
 
				             self._log.error(err)
			
 
				             raise FileNotFoundError(err)
			
 
				 
			
@@ -34,7 +34,7 @@ class ParseMapping:
 
				                 self._mapping = json.load(f)
			
 
				 
			
 
				         except Exception as e:
			
 
				-            err = ("Could not load mapping. "
			
 
				+            err = ("Could not load mapping. " + mapping_path +
			
 
				                    "Exit with error {}".format(e))
			
 
				             self._log.error(err)
			
 
				             raise Exception(err)
			
@@ -97,6 +97,42 @@ class ParseMapping:
 
				         '''
			
 
				         '''
			
 
				         return self._get_info(key="date_format")
			
 
				+    
			
 
				+    def get_internal_names(self) -> dict:
			
 
				+        '''
			
 
				+        '''
			
 
				+ 
			
 
				+        if all(["internal_name" in d for d in self._mapping]):
			
 
				+            internal_names = [d["internal_name"] for d in self._mapping]
			
 
				+    
			
 
				+        elif all(["internal_name" not in d for d in self._mapping]):
			
 
				+            internal_names = list(range(len(self._mapping)))
			
 
				+
			
 
				+
			
 
				+        else:
			
 
				+            err = ("Incorrectly filled mapping. Internal names should "
			
 
				+                   "either be in all or in neither of the fields")
			
 
				+            self._log.error(err)
			
 
				+            raise Exception(err)
			
 
				+
			
 
				+        return internal_names
			
 
				+
			
 
				+    def get_mongo_names(self) -> dict:
			
 
				+        '''
			
 
				+        '''
			
 
				+        if all(["mongo_name" in d for d in self._mapping]):
			
 
				+            mongo_names = [d["mongo_name"] for d in self._mapping]
			
 
				+
			
 
				+        elif all(["mongo_name" not in d for d in self._mapping]):
			
 
				+            mongo_names = list(range(len(self._mapping)))
			
 
				+
			
 
				+        else:
			
 
				+            err = ("Incorrectly filled mapping. Mongo names should "
			
 
				+                   "either be in all or in neither of the fields")
			
 
				+            self._log.error(err)
			
 
				+            raise Exception(err)
			
 
				+
			
 
				+        return mongo_names
			
 
				 
			
 
				     def get_types(self) -> dict:
			
 
				         '''
			
@@ -134,7 +170,7 @@ class ParseMapping:
 
				         else:
			
 
				             err = ("Incorrectly filled mapping. Column numbers should ",
			
 
				                    "either in all or in neither of the fields")
			
 
				-            self.log.err(err)
			
 
				+            self._log.err(err)
			
 
				             raise Exception(err)
			
 
				 
			
 
				         return column_numbers
			
--- a/cdplib/log.py
+++ b/cdplib/log.py
@@ -60,6 +60,44 @@ class Log():
 
				 
			
 
				         # self._logger.setLevel(log_level)
			
 
				 
			
 
				+
			
 
				+    @property
			
 
				+    def magenta(self):
			
 
				+        return '\033[95m'
			
 
				+
			
 
				+    @property
			
 
				+    def blue(self):
			
 
				+        return '\033[94m'
			
 
				+
			
 
				+    @property
			
 
				+    def cyan(self):
			
 
				+        return '\u001b[36m'
			
 
				+
			
 
				+    @property
			
 
				+    def green(self):
			
 
				+        return '\033[92m'
			
 
				+
			
 
				+    @property
			
 
				+    def yellow(self):
			
 
				+        return '\033[93m'
			
 
				+
			
 
				+    @property
			
 
				+    def fail(self):
			
 
				+        return '\033[91m'
			
 
				+
			
 
				+    @property
			
 
				+    def reset(self):
			
 
				+        return '\033[0m'
			
 
				+
			
 
				+    @property
			
 
				+    def bold(self):
			
 
				+        return '\033[1m'
			
 
				+
			
 
				+    @property
			
 
				+    def underline(self):
			
 
				+        return '\033[4m'
			
 
				+
			
 
				+
			
 
				     def info(self, message: str):
			
 
				         self._logger.info(message)
			
 
				 
			
--- a/cdplib/unit_tests/TestFlattenData.py
+++ b/cdplib/unit_tests/TestFlattenData.py
@@ -7,7 +7,7 @@ sys.path.append(os.getcwd())
 
				 from cdplib.log import Log
			
 
				 from cdplib.FlattenData import FlattenData
			
 
				 
			
 
				-class TestMongodbHandler(unittest.TestCase):
			
 
				+class TestFlattenData(unittest.TestCase):
			
 
				 
			
 
				     def setUp(self):
			
 
				         self.flattener = FlattenData()
			
--- a/cdplib/unit_tests/TestLog.py
+++ b/cdplib/unit_tests/TestLog.py
@@ -0,0 +1,26 @@
 
				+import unittest
			
 
				+import sys
			
 
				+import os
			
 
				+from pprint import pprint
			
 
				+sys.path.append(os.getcwd())
			
 
				+from cdplib.log import Log
			
 
				+
			
 
				+
			
 
				+class TestLog(unittest.TestCase):
			
 
				+
			
 
				+    def setUp(self):
			
 
				+       self._log = Log('Log Test')
			
 
				+
			
 
				+    def test_A_Log_Colors(self):
			
 
				+        self._log.info('Test Starts Here')
			
 
				+        self._log.info(self._log.magenta + "Header")
			
 
				+        self._log.info(self._log.blue + "Blue")
			
 
				+        self._log.info( self._log.green + "Green")
			
 
				+        self._log.info(self._log.yellow + "yellow")
			
 
				+        self._log.info(self._log.fail + "Fail")
			
 
				+        self._log.info(self._log.reset + "reset")
			
 
				+        self._log.info(self._log.bold + "bold" )
			
 
				+        self._log.info(self._log.underline + "underline" )
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    unittest.main()
			
--- a/cdplib/unit_tests/TestMongodbHandler.py
+++ b/cdplib/unit_tests/TestMongodbHandler.py
@@ -1,6 +1,7 @@
 
				 import unittest
			
 
				 import sys
			
 
				 import os
			
 
				+import time
			
 
				 from pymongo import MongoClient
			
 
				 sys.path.append(os.getcwd())
			
 
				 from cdplib.log import Log
			
@@ -25,12 +26,14 @@ class TestMongodbHandler(unittest.TestCase):
 
				         self.valid_input = {
			
 
				                         "test_value_string": "test_value",
			
 
				                         "test_value_double": 2.4,
			
 
				-                        "test_value_double_array": [1.4, 1.6, 3.5]
			
 
				+                        "test_value_double_array": [1.4, 1.6, 3.5],
			
 
				+                        "test_value_date": "2020-01-28T15:45:25.000Z"
			
 
				                         }
			
 
				         self.invalid_input = {
			
 
				                         "test_value_string": 1,
			
 
				                         "test_value_double": "Wrong value",
			
 
				-                        "test_value_double_array": [1.4, 1.6, 3.5]
			
 
				+                        "test_value_double_array": [1.4, 1.6, 3.5],
			
 
				+                        "test_value_date": "2019-01-28T15:45:25.000Z"
			
 
				                         }
			
 
				 
			
 
				 
			
@@ -81,9 +84,10 @@ class TestMongodbHandler(unittest.TestCase):
 
				         Fetch data and confirms thats it is the same as was entered into the database
			
 
				         Do the same with more specific query
			
 
				         '''
			
 
				+
			
 
				         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
			
 
				         self.assertEqual(self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name, 'test_value_string', 'test_value').to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
			
 
				-    
			
 
				+
			
 
				     def test_F_aggregate_data_and_generate_dataframe(self):
			
 
				         '''
			
 
				         Make an aggregation call
			
@@ -93,7 +97,7 @@ class TestMongodbHandler(unittest.TestCase):
 
				                                 { '$match': {}}
			
 
				                                 ]
			
 
				         self.assertEqual(self.mongodb_handler.aggregate_data_and_generate_dataframe(self.first_collection_name, aggregation_pipeline).to_dict()['test_value_double'][0], self.valid_input['test_value_double'])
			
 
				-    
			
 
				+
			
 
				     def test_G_update_data_in_collection(self):
			
 
				         '''
			
 
				         Fetch data from database
			
@@ -104,7 +108,7 @@ class TestMongodbHandler(unittest.TestCase):
 
				         '''
			
 
				         original_value = self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
			
 
				         self.assertEqual(original_value, self.valid_input['test_value_string'])
			
 
				-        self.mongodb_handler.update_data_in_collection('test_value_string', 'test_value', 'test_value_string', 'new_test_value', self.first_collection_name)
			
 
				+        self.mongodb_handler.update_data_in_collection('test_value_string', 'new_test_value', self.first_collection_name, 'test_value_string', 'test_value', create_if_not_exist=False)
			
 
				         new_value =  self.mongodb_handler.query_data_and_generate_dataframe(self.first_collection_name).to_dict()['test_value_string'][0]
			
 
				         self.assertNotEqual(original_value, new_value)
			
 
				 
			
@@ -116,6 +120,11 @@ class TestMongodbHandler(unittest.TestCase):
 
				         index = 'test_value_string'
			
 
				         self.mongodb_handler.create_index(self.first_collection_name, index)
			
 
				         self.assertTrue(index in list(self.database[self.first_collection_name].index_information().keys()))
			
 
				+
			
 
				+    def test_I_query_data_between_dates_and_generate_dataframe(self):
			
 
				+
			
 
				+            data = self.mongodb_handler.query_data_between_dates_and_generate_dataframe(self.first_collection_name, "test_value_date", "2020-01-27T15:45:25.000Z", "2020-01-29T15:45:25.000Z", index ='test_value_string')
			
 
				+            self.assertEqual(data['test_value_double'][0], self.valid_input['test_value_double'])
			
 
				     
			
 
				     def test_Y_drop_collection(self):
			
 
				         '''
			
--- a/hooks/README.txt
+++ b/hooks/README.txt
@@ -0,0 +1,13 @@
 
				+These files are GIT HOOKS.
			
 
				+
			
 
				+A git hook is a script which is executed when a git command is run.
			
 
				+The hook in this folder is executed when committing (pre-commit).
			
 
				+
			
 
				+pre-commit executes the unit tests before committing the changes.
			
 
				+
			
 
				+ACHTUNG!
			
 
				+Changes will still be committed and pushed even if there are errors in the tests (for now at least).
			
 
				+So please pay attention to the tests and make sure that they ran without any problems. If the test
			
 
				+failed, please fix the issue before pushing the changes!
			
 
				+
			
 
				+To use it please copy the files into your .git/hooks folder.
			
--- a/hooks/pre-commit
+++ b/hooks/pre-commit
@@ -0,0 +1,17 @@
 
				+#!/bin/bash
			
 
				+#
			
 
				+# Runs test pipeline before committing
			
 
				+#
			
 
				+# To enable this hook, rename this file to "pre-commit".
			
 
				+echo
			
 
				+echo "Running unit tests"
			
 
				+echo
			
 
				+
			
 
				+python cdplib/unit_tests/TestFlattenData.py
			
 
				+python cdplib/unit_tests/TestLog.py
			
 
				+python cdplib/unit_tests/TestMongodbHandler.py
			
 
				+
			
 
				+echo 
			
 
				+echo
			
 
				+echo -e "\033 Unit tests have been run and your data commited, in case any of the tests failed, please correct this before pushing changes"
			
 
				+echo
			
--- a/setup.py
+++ b/setup.py
@@ -1,32 +1,17 @@
 
				 from setuptools import setup,find_packages
			
 
				 
			
 
				 INSTALL_REQUIRES = [
			
 
				-        'pycodestyle',
			
 
				-        'ipykernel',
			
 
				-  		'spyder-kernels==0.*',
			
 
				-        'cloudpickle',
			
 
				-        'openpyxl',
			
 
				-        'setuptools',
			
 
				-        'scipy',
			
 
				-        'matplotlib',
			
 
				-        'tsfresh',
			
 
				-        'hyperopt',
			
 
				-        'xgboost',
			
 
				-        'scikit-learn',
			
 
				-  		'pandas',
			
 
				-        'pandas-compat',
			
 
				-        'xmltodict',
			
 
				+        'pandas',
			
 
				         'sqlalchemy',
			
 
				         'sqlparse',
			
 
				         'pymysql',
			
 
				-        'xlrd',
			
 
				         'pymongo',
			
 
				-        'jsonref', 
			
 
				-        'faker',
			
 
				-        'xeger',
			
 
				+        'jsonref',
			
 
				         'simplejson',
			
 
				         'mysql',
			
 
				-        'sqlalchemy-utils',
			
 
				+        'sqlalchemy_utils',
			
 
				+        'sklearn',
			
 
				+        'hyperopt',
			
 
				 ]